Посмотрим на готовые признаки и сделаем первую посылку.
import os
import json
import pandas as pd
import numpy as np
import datetime
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
%matplotlib inline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import RocCurveDisplay
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample
# Global reproducibility seed used for every split / model in this notebook.
SEED = 10801
sns.set_style(style="whitegrid")
plt.rcParams["figure.figsize"] = 12, 8
# NOTE(review): blanket suppression also hides deprecation warnings — consider narrowing.
warnings.filterwarnings("ignore")
Файлы:
sample_submission.csv: пример файла-посылки; train_raw_data.jsonl, test_raw_data.jsonl: "сырые" данные; train_data.csv, test_data.csv: признаки, созданные авторами; train_targets.csv: результаты тренировочных игр. Это набор простых признаков, описывающих игроков и команды в целом.
# Directory with the competition data (relative to the notebook).
PATH_TO_DATA = "../data"
# Pre-built per-player/per-team features, one row per match, keyed by the match hash.
df_train_features = pd.read_csv(os.path.join(PATH_TO_DATA,
                                             "train_data.csv"),
                                index_col="match_id_hash")
# Match outcomes ("future" information, train set only).
df_train_targets = pd.read_csv(os.path.join(PATH_TO_DATA,
                                            "train_targets.csv"),
                               index_col="match_id_hash")
df_train_features.shape
(31698, 245)
# Quick look at the first rows of the prepared feature table.
df_train_features.head()
| game_time | game_mode | lobby_type | objectives_len | chat_len | r1_hero_id | r1_kills | r1_deaths | r1_assists | r1_denies | ... | d5_stuns | d5_creeps_stacked | d5_camps_stacked | d5_rune_pickups | d5_firstblood_claimed | d5_teamfight_participation | d5_towers_killed | d5_roshans_killed | d5_obs_placed | d5_sen_placed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| match_id_hash | |||||||||||||||||||||
| b9c57c450ce74a2af79c9ce96fac144d | 658 | 4 | 0 | 3 | 10 | 15 | 7 | 2 | 0 | 7 | ... | 0.000000 | 0 | 0 | 0 | 0 | 0.00 | 0 | 0 | 0 | 0 |
| 6db558535151ea18ca70a6892197db41 | 21 | 23 | 0 | 0 | 0 | 101 | 0 | 0 | 0 | 0 | ... | 0.000000 | 0 | 0 | 0 | 0 | 0.00 | 0 | 0 | 0 | 0 |
| 19c39fe2af2b547e48708ca005c6ae74 | 160 | 22 | 7 | 0 | 0 | 57 | 0 | 0 | 0 | 1 | ... | 0.000000 | 0 | 0 | 0 | 0 | 0.00 | 0 | 0 | 0 | 0 |
| c96d629dc0c39f0c616d1949938a6ba6 | 1016 | 22 | 0 | 1 | 0 | 119 | 0 | 3 | 3 | 5 | ... | 8.264696 | 0 | 0 | 3 | 0 | 0.25 | 0 | 0 | 3 | 0 |
| 156c88bff4e9c4668b0f53df3d870f1b | 582 | 22 | 7 | 2 | 2 | 12 | 3 | 1 | 2 | 9 | ... | 15.762911 | 3 | 1 | 0 | 1 | 0.50 | 0 | 0 | 3 | 0 |
5 rows × 245 columns
Имеем ~32 тысячи наблюдений, каждое из которых характеризуется уникальным match_id_hash (захэшированное id матча), и 245 признаков. game_time показывает момент времени, в который получены эти данные. То есть по сути это не длительность самого матча, а например, его середина, таким образом, в итоге мы сможем получить модель, которая будет предсказывать вероятность победы каждой из команд в течение матча (хорошо подходит для букмекеров).
Нас интересует поле radiant_win (так называется одна из команд, вторая - dire). Остальные колонки здесь по сути получены из "будущего" и есть только для тренировочных данных, поэтому на них можно просто посмотреть.
# Preview the targets; only radiant_win is used as the label below.
df_train_targets.head()
| game_time | radiant_win | duration | time_remaining | next_roshan_team | |
|---|---|---|---|---|---|
| match_id_hash | |||||
| b9c57c450ce74a2af79c9ce96fac144d | 658 | True | 1154 | 496 | NaN |
| 6db558535151ea18ca70a6892197db41 | 21 | True | 1503 | 1482 | Radiant |
| 19c39fe2af2b547e48708ca005c6ae74 | 160 | False | 2063 | 1903 | NaN |
| c96d629dc0c39f0c616d1949938a6ba6 | 1016 | True | 2147 | 1131 | Radiant |
| 156c88bff4e9c4668b0f53df3d870f1b | 582 | False | 1927 | 1345 | Dire |
# Feature matrix and binary target (1 = Radiant won).
X = df_train_features.values
y = df_train_targets["radiant_win"].values.astype("int8")
# 70/30 hold-out split for a first sanity-check evaluation.
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      test_size=0.3,
                                                      random_state=SEED)
%%time
# Baseline model: shallow random forest (depth 7) to limit overfitting.
rf_model = RandomForestClassifier(n_estimators=300, max_depth=7, n_jobs=-1, random_state=SEED)
rf_model.fit(X_train, y_train)
CPU times: user 30.6 s, sys: 62.5 ms, total: 30.6 s Wall time: 4.86 s
RandomForestClassifier(max_depth=7, n_estimators=300, n_jobs=-1,
random_state=10801)
# Predicted probability of Radiant winning; ROC-AUC on the hold-out part.
y_pred = rf_model.predict_proba(X_valid)[:, 1]
valid_score = roc_auc_score(y_valid, y_pred)
print("ROC-AUC score на отложенной части:", valid_score)
ROC-AUC score на отложенной части: 0.7754387258058622
Посмотрим на accuracy:
# Accuracy with a fixed 0.5 decision threshold, for reference.
valid_accuracy = accuracy_score(y_valid, y_pred > 0.5)
print("Accuracy score (p > 0.5) на отложенной части:", valid_accuracy)
Accuracy score (p > 0.5) на отложенной части: 0.6885383806519453
# Load the test features and produce the first submission file.
df_test_features = pd.read_csv(os.path.join(PATH_TO_DATA, "test_data.csv"),
                               index_col="match_id_hash")
X_test = df_test_features.values
# Probability of Radiant winning for each test match.
y_test_pred = rf_model.predict_proba(X_test)[:, 1]
df_submission = pd.DataFrame({"radiant_win_prob": y_test_pred},
                             index=df_test_features.index)
# Timestamped file name so successive submissions don't overwrite each other.
submission_filename = "submission_{}.csv".format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))
Файл посылки сохранен, как: submission_2023-04-09_18-13-59.csv
Во многих случаях кросс-валидация оказывается лучше простого разбиения на test и train. Воспользуемся ShuffleSplit чтобы создать 5 70%/30% наборов данных.
# Five random 70/30 splits for cross-validation.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=SEED)
%%time
# Same baseline forest, now scored with 5-fold shuffle-split ROC-AUC.
rf_model = RandomForestClassifier(n_estimators=300, max_depth=7, n_jobs=-1, random_state=SEED)
cv_scores_rf = cross_val_score(rf_model, X, y, cv=cv, scoring="roc_auc")
CPU times: user 8.36 s, sys: 675 ms, total: 9.04 s Wall time: 31.8 s
# Per-fold ROC-AUC scores.
cv_scores_rf
array([0.77543873, 0.77343884, 0.76649967, 0.7722681 , 0.77246001])
# Mean cross-validated ROC-AUC of the baseline.
print(f"Среднее значение ROC-AUC на кросс-валидации: {cv_scores_rf.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.7720210676055513
Описание сырых данных можно найти в train_matches.jsonl и test_matches.jsonl. Каждый файл содержит одну запись для каждого матча в JSON формате. Его легко превратить в питоновский объект при помощи метода json.loads.
with open(os.path.join(PATH_TO_DATA, "train_raw_data.jsonl")) as fin:
    # read up to the 419th line
    for i in range(419):
        line = fin.readline()
# parse the JSON line into a Python dict
match = json.loads(line)
# players[0..4] are Radiant, players[5..9] are Dire; take the last Dire player
player = match["players"][9]
player["kills"], player["deaths"], player["assists"]
(0, 5, 5)
# Top-level fields of one raw match record.
match.keys()
dict_keys(['game_time', 'match_id_hash', 'teamfights', 'objectives', 'chat', 'game_mode', 'lobby_type', 'players', 'targets'])
# Per-player fields available in the raw data.
player.keys()
dict_keys(['player_slot', 'hero_id', 'hero_name', 'account_id_hash', 'ability_upgrades', 'obs_placed', 'sen_placed', 'creeps_stacked', 'camps_stacked', 'rune_pickups', 'firstblood_claimed', 'teamfight_participation', 'towers_killed', 'roshans_killed', 'observers_placed', 'stuns', 'max_hero_hit', 'times', 'gold_t', 'lh_t', 'dn_t', 'xp_t', 'obs_log', 'sen_log', 'obs_left_log', 'sen_left_log', 'purchase_log', 'kills_log', 'buyback_log', 'runes_log', 'obs', 'sen', 'actions', 'pings', 'purchase', 'gold_reasons', 'xp_reasons', 'killed', 'item_uses', 'ability_uses', 'hero_hits', 'damage', 'damage_taken', 'damage_inflictor', 'runes', 'killed_by', 'kill_streaks', 'multi_kills', 'life_state', 'healing', 'damage_inflictor_received', 'randomed', 'pred_vict', 'gold', 'lh', 'xp', 'x', 'y', 'hero_inventory', 'hero_stash', 'health', 'max_health', 'max_mana', 'level', 'kills', 'deaths', 'assists', 'denies', 'nearby_creep_death_count'])
# Creeps that died near this player — a proxy for lane activity.
player['nearby_creep_death_count']
164
# Total number of ability casts across all of this player's abilities.
sum(list(player['ability_uses'].values()))
62
# Total number of items purchased by this player.
sum(list(player["purchase"].values()))
36
# List every per-player field name, one per line.
for ab in player.keys():
    print(ab)
player_slot hero_id hero_name account_id_hash ability_upgrades obs_placed sen_placed creeps_stacked camps_stacked rune_pickups firstblood_claimed teamfight_participation towers_killed roshans_killed observers_placed stuns max_hero_hit times gold_t lh_t dn_t xp_t obs_log sen_log obs_left_log sen_left_log purchase_log kills_log buyback_log runes_log obs sen actions pings purchase gold_reasons xp_reasons killed item_uses ability_uses hero_hits damage damage_taken damage_inflictor runes killed_by kill_streaks multi_kills life_state healing damage_inflictor_received randomed pred_vict gold lh xp x y hero_inventory hero_stash health max_health max_mana level kills deaths assists denies nearby_creep_death_count
KDA - может быть неплохим признаком, этот показатель считается как:
Информация о количестве использованных способностей:
# Per-ability cast counts for this player.
player["ability_uses"]
{'treant_living_armor': 52, 'treant_leech_seed': 5, 'treant_overgrowth': 5}
# XP time series for all 10 players of the sampled match.
for i, player in enumerate(match["players"]):
    plt.plot(player["times"], player["xp_t"], label=str(i+1))
plt.legend()
plt.xlabel("Time, s")
plt.ylabel("XP")
plt.title("XP change for all players");
В этот раз для чтения JSON-файлов лучше использовать библиотеку ujson, иначе все будет слишком долго :(
# Prefer ujson (a fast drop-in replacement for the stdlib json) — we are about
# to parse ~32k large JSON lines.
try:
    import ujson as json
except ModuleNotFoundError:
    import json
    print("Подумайте об установке ujson, чтобы работать с JSON объектами быстрее")
try:
    from tqdm.notebook import tqdm
except ModuleNotFoundError:
    # BUG FIX: the fallback used to bind `tqdm_notebook`, but the code below
    # calls `tqdm(fin, total=...)` — without tqdm installed that raised
    # NameError. Bind the correct name and accept tqdm-style kwargs.
    def tqdm(iterable, **kwargs):
        return iterable
    print("Подумайте об установке tqdm, чтобы следить за прогрессом")
def read_matches(matches_file, total_matches=31698, n_matches_to_read=None):
    """Lazily yield parsed match records from a JSONL file.

    Parameters
    ----------
    matches_file : path to the JSONL file with raw match data
    total_matches : expected number of lines (used only for the progress bar)
    n_matches_to_read : stop after this many matches; None means read them all

    Yields
    ------
    dict — one parsed match per JSON line.
    """
    limit = total_matches if n_matches_to_read is None else n_matches_to_read
    with open(matches_file) as fin:
        for read_so_far, line in enumerate(tqdm(fin, total=total_matches)):
            if read_so_far >= limit:
                break
            yield json.loads(line)
Чтение всех данных занимает 1-2 минуты, поэтому для начала можно попробовать следующее:
Сохранить результат в pickle-файл, чтобы в следующий раз не переделывать все заново. Напишем функцию, которая поможет нам легче добавлять новые признаки.
So, I supposed that the following features might be useful:
def add_new_features(df_features, matches_file):
    """
    Arguments
    ---------
    df_features: feature table to extend (modified in place)
    matches_file: JSONL file with raw match data

    Result
    ------
    Adds new per-match and per-player columns to the table.
    """
    for match in read_matches(matches_file):
        match_id_hash = match['match_id_hash']
        # Count towers destroyed by each team.
        # team 2 = Radiant, team 3 = Dire (Dota 2 replay convention).
        radiant_tower_kills = 0
        dire_tower_kills = 0
        for objective in match["objectives"]:
            if objective["type"] == "CHAT_MESSAGE_TOWER_KILL":
                if objective["team"] == 2:
                    radiant_tower_kills += 1
                if objective["team"] == 3:
                    dire_tower_kills += 1
        # Team-level tower features; .loc with a new column name enlarges the frame.
        df_features.loc[match_id_hash, "radiant_tower_kills"] = radiant_tower_kills
        df_features.loc[match_id_hash, "dire_tower_kills"] = dire_tower_kills
        df_features.loc[match_id_hash, "diff_tower_kills"] = radiant_tower_kills - dire_tower_kills
        # Per-player aggregates derived from the raw event dictionaries.
        for player_id in range(10):
            player = match["players"][player_id]
            ability_upgr_n = len(player["ability_upgrades"])
            ability_uses_n = sum(list(player['ability_uses'].values()))
            purchase_n = sum(list(player["purchase"].values()))
            item_use_n = sum(list(player["item_uses"].values()))
            dmg = sum(list(player["damage"].values()))
            dmg_taken = sum(list(player["damage_taken"].values()))
            nearby_creep_death_count = player["nearby_creep_death_count"]
            pred_vict = player["pred_vict"]
            # players 0-4 are Radiant (r1..r5), 5-9 are Dire (d1..d5)
            player_num = f"r{player_id + 1}" if player_id <= 4 else f"d{player_id - 4}"
            df_features.loc[match_id_hash, f"{player_num}_ability_upgrades"] = ability_upgr_n
            df_features.loc[match_id_hash, f"{player_num}_ability_uses"] = ability_uses_n
            df_features.loc[match_id_hash, f"{player_num}_purchases"] = purchase_n
            df_features.loc[match_id_hash, f"{player_num}_item_uses"] = item_use_n
            df_features.loc[match_id_hash, f"{player_num}_damage_given"] = dmg
            df_features.loc[match_id_hash, f"{player_num}_damage_taken"] = dmg_taken
            df_features.loc[match_id_hash, f"{player_num}_nearby_creep_death_count"] = nearby_creep_death_count
            df_features.loc[match_id_hash, f"{player_num}_pred_vict"] = pred_vict
            # ... (/¯◡ ‿ ◡)/¯☆*:・゚ add more features here ...
# Copy the feature tables so the originals stay intact.
df_train_features_extended = df_train_features.copy()
df_test_features_extended = df_test_features.copy()
# Add the new raw-data-derived features (takes a couple of minutes per file).
add_new_features(df_train_features_extended,
                 os.path.join(PATH_TO_DATA,
                              "train_raw_data.jsonl"))
add_new_features(df_test_features_extended,
                 os.path.join(PATH_TO_DATA,
                              "test_raw_data.jsonl"))
0%| | 0/31698 [00:00<?, ?it/s]
0%| | 0/31698 [00:00<?, ?it/s]
# Cache the extended tables so the slow JSONL pass is not repeated next time.
df_train_features_extended.to_csv("df_train_extended.tsv", sep="\t")
df_test_features_extended.to_csv("df_test_extended.tsv", sep="\t")
df_train_features_extended.head()
| game_time | game_mode | lobby_type | objectives_len | chat_len | r1_hero_id | r1_kills | r1_deaths | r1_assists | r1_denies | ... | d4_purchases | d4_item_uses | d4_damage_given | d4_damage_taken | d5_ability_upgrades | d5_ability_uses | d5_purchases | d5_item_uses | d5_damage_given | d5_damage_taken | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| match_id_hash | |||||||||||||||||||||
| b9c57c450ce74a2af79c9ce96fac144d | 658 | 4 | 0 | 3 | 10 | 15 | 7 | 2 | 0 | 7 | ... | 24.0 | 15.0 | 7437.0 | 5893.0 | 4.0 | 7.0 | 12.0 | 9.0 | 2308.0 | 2154.0 |
| 6db558535151ea18ca70a6892197db41 | 21 | 23 | 0 | 0 | 0 | 101 | 0 | 0 | 0 | 0 | ... | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 1.0 | 0.0 | 0.0 |
| 19c39fe2af2b547e48708ca005c6ae74 | 160 | 22 | 7 | 0 | 0 | 57 | 0 | 0 | 0 | 1 | ... | 10.0 | 1.0 | 1250.0 | 1040.0 | 0.0 | 2.0 | 9.0 | 2.0 | 4496.0 | 948.0 |
| c96d629dc0c39f0c616d1949938a6ba6 | 1016 | 22 | 0 | 1 | 0 | 119 | 0 | 3 | 3 | 5 | ... | 10.0 | 20.0 | 35511.0 | 9456.0 | 5.0 | 32.0 | 21.0 | 14.0 | 4682.0 | 1608.0 |
| 156c88bff4e9c4668b0f53df3d870f1b | 582 | 22 | 7 | 2 | 2 | 12 | 3 | 1 | 2 | 9 | ... | 19.0 | 19.0 | 9070.0 | 2818.0 | 2.0 | 19.0 | 14.0 | 8.0 | 2156.0 | 1593.0 |
5 rows × 308 columns
# Sanity-check the extended test table as well.
df_test_features_extended.head()
| game_time | game_mode | lobby_type | objectives_len | chat_len | r1_hero_id | r1_kills | r1_deaths | r1_assists | r1_denies | ... | d4_purchases | d4_item_uses | d4_damage_given | d4_damage_taken | d5_ability_upgrades | d5_ability_uses | d5_purchases | d5_item_uses | d5_damage_given | d5_damage_taken | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| match_id_hash | |||||||||||||||||||||
| a400b8f29dece5f4d266f49f1ae2e98a | 155 | 22 | 7 | 1 | 11 | 11 | 0 | 0 | 0 | 0 | ... | 9.0 | 4.0 | 868.0 | 16.0 | 0.0 | 4.0 | 6.0 | 3.0 | 2332.0 | 681.0 |
| 46a0ddce8f7ed2a8d9bd5edcbb925682 | 576 | 22 | 7 | 1 | 4 | 14 | 1 | 0 | 3 | 1 | ... | 19.0 | 7.0 | 6010.0 | 4201.0 | 3.0 | 19.0 | 13.0 | 11.0 | 3955.0 | 3317.0 |
| b1b35ff97723d9b7ade1c9c3cf48f770 | 453 | 22 | 7 | 1 | 3 | 42 | 0 | 1 | 1 | 0 | ... | 9.0 | 7.0 | 8388.0 | 3160.0 | 2.0 | 9.0 | 7.0 | 6.0 | 10739.0 | 2785.0 |
| ab3cc6ccac661a1385e73a2e9f21313a | 721 | 4 | 0 | 2 | 1 | 30 | 2 | 2 | 1 | 3 | ... | 19.0 | 13.0 | 5053.0 | 3237.0 | 7.0 | 7.0 | 14.0 | 10.0 | 15255.0 | 5052.0 |
| 54aaab1cb8cc5df3c253641618673266 | 752 | 22 | 7 | 1 | 0 | 8 | 2 | 0 | 2 | 8 | ... | 15.0 | 16.0 | 21681.0 | 3230.0 | 5.0 | 64.0 | 23.0 | 11.0 | 16086.0 | 5782.0 |
5 rows × 308 columns
%%time
# Compare baseline vs. extended feature sets with the same CV splitter.
cv_scores_base = cross_val_score(rf_model, X, y, cv=cv, scoring="roc_auc", n_jobs=-1)
cv_scores_extended = cross_val_score(rf_model, df_train_features_extended.values, y,
                                     cv=cv, scoring="roc_auc", n_jobs=-1)
CPU times: user 103 ms, sys: 228 ms, total: 331 ms Wall time: 58.6 s
# Report both mean scores side by side.
print(f"ROC-AUC на кросс-валидации для базовых признаков: {cv_scores_base.mean()}")
print(f"ROC-AUC на кросс-валидации для новых признаков: {cv_scores_extended.mean()}")
ROC-AUC на кросс-валидации для базовых признаков: 0.7720210676055513 ROC-AUC на кросс-валидации для новых признаков: 0.778629346381704
Видно, что случайный лес стал работать немного лучше при добавлении новых признаков. A еще нужно, наверное, как-то по-умному закодировать категориальные признаки.
Дальше дело за малым. Добавляйте новые признаки, пробуйте другие методы, которые мы изучили, а также что-то интересное, что мы не прошли. Удачи!
Количество игр в зависимости от времени игры распределено неравномерно; возможно, это связано с тем, что игры заканчиваются в разное время, и длинных игр не так много.
# Distribution of game_time snapshots; the trailing ";" suppresses the Axes
# repr in notebook output (the lone ";" line was an export artifact and is
# not a valid standalone statement).
sns.histplot(df_train_features, x="game_time");
''
# Full feature-correlation heatmap; the trailing ";" suppresses the Axes
# repr (the lone ";" line was an export artifact, not a valid statement).
sns.heatmap(df_train_features.corr());
''
Посмотрим на корреляции признаков для одного игрока, видим, что некоторые из них довольно сильно коррелируют
# Features of the first Radiant player only.
r1_ftrs_lst = [ftr for ftr in df_train_features.columns if ftr.startswith("r1")]
# NOTE(review): `corr() > 0.5` yields booleans, so annot with fmt=".2f" prints
# only 1.00/0.00 — probably `corr()` without the threshold was intended here
# (as done later at the same plot with annot); confirm.
sns.heatmap(df_train_features[r1_ftrs_lst].corr() > 0.5, annot=True, fmt=".2f")
<AxesSubplot: >
Если наложить для всех 10 игроков корреляции признаков друг на друга, то будет виден определенный паттерн, эти признаки надо будет как-то объединить\трансформировать.
# Full column list of the extended table.
list(df_train_features_extended.columns)
['game_time', 'game_mode', 'lobby_type', 'objectives_len', 'chat_len', 'r1_hero_id', 'r1_kills', 'r1_deaths', 'r1_assists', 'r1_denies', 'r1_gold', 'r1_lh', 'r1_xp', 'r1_health', 'r1_max_health', 'r1_max_mana', 'r1_level', 'r1_x', 'r1_y', 'r1_stuns', 'r1_creeps_stacked', 'r1_camps_stacked', 'r1_rune_pickups', 'r1_firstblood_claimed', 'r1_teamfight_participation', 'r1_towers_killed', 'r1_roshans_killed', 'r1_obs_placed', 'r1_sen_placed', 'r2_hero_id', 'r2_kills', 'r2_deaths', 'r2_assists', 'r2_denies', 'r2_gold', 'r2_lh', 'r2_xp', 'r2_health', 'r2_max_health', 'r2_max_mana', 'r2_level', 'r2_x', 'r2_y', 'r2_stuns', 'r2_creeps_stacked', 'r2_camps_stacked', 'r2_rune_pickups', 'r2_firstblood_claimed', 'r2_teamfight_participation', 'r2_towers_killed', 'r2_roshans_killed', 'r2_obs_placed', 'r2_sen_placed', 'r3_hero_id', 'r3_kills', 'r3_deaths', 'r3_assists', 'r3_denies', 'r3_gold', 'r3_lh', 'r3_xp', 'r3_health', 'r3_max_health', 'r3_max_mana', 'r3_level', 'r3_x', 'r3_y', 'r3_stuns', 'r3_creeps_stacked', 'r3_camps_stacked', 'r3_rune_pickups', 'r3_firstblood_claimed', 'r3_teamfight_participation', 'r3_towers_killed', 'r3_roshans_killed', 'r3_obs_placed', 'r3_sen_placed', 'r4_hero_id', 'r4_kills', 'r4_deaths', 'r4_assists', 'r4_denies', 'r4_gold', 'r4_lh', 'r4_xp', 'r4_health', 'r4_max_health', 'r4_max_mana', 'r4_level', 'r4_x', 'r4_y', 'r4_stuns', 'r4_creeps_stacked', 'r4_camps_stacked', 'r4_rune_pickups', 'r4_firstblood_claimed', 'r4_teamfight_participation', 'r4_towers_killed', 'r4_roshans_killed', 'r4_obs_placed', 'r4_sen_placed', 'r5_hero_id', 'r5_kills', 'r5_deaths', 'r5_assists', 'r5_denies', 'r5_gold', 'r5_lh', 'r5_xp', 'r5_health', 'r5_max_health', 'r5_max_mana', 'r5_level', 'r5_x', 'r5_y', 'r5_stuns', 'r5_creeps_stacked', 'r5_camps_stacked', 'r5_rune_pickups', 'r5_firstblood_claimed', 'r5_teamfight_participation', 'r5_towers_killed', 'r5_roshans_killed', 'r5_obs_placed', 'r5_sen_placed', 'd1_hero_id', 'd1_kills', 'd1_deaths', 'd1_assists', 'd1_denies', 'd1_gold', 
'd1_lh', 'd1_xp', 'd1_health', 'd1_max_health', 'd1_max_mana', 'd1_level', 'd1_x', 'd1_y', 'd1_stuns', 'd1_creeps_stacked', 'd1_camps_stacked', 'd1_rune_pickups', 'd1_firstblood_claimed', 'd1_teamfight_participation', 'd1_towers_killed', 'd1_roshans_killed', 'd1_obs_placed', 'd1_sen_placed', 'd2_hero_id', 'd2_kills', 'd2_deaths', 'd2_assists', 'd2_denies', 'd2_gold', 'd2_lh', 'd2_xp', 'd2_health', 'd2_max_health', 'd2_max_mana', 'd2_level', 'd2_x', 'd2_y', 'd2_stuns', 'd2_creeps_stacked', 'd2_camps_stacked', 'd2_rune_pickups', 'd2_firstblood_claimed', 'd2_teamfight_participation', 'd2_towers_killed', 'd2_roshans_killed', 'd2_obs_placed', 'd2_sen_placed', 'd3_hero_id', 'd3_kills', 'd3_deaths', 'd3_assists', 'd3_denies', 'd3_gold', 'd3_lh', 'd3_xp', 'd3_health', 'd3_max_health', 'd3_max_mana', 'd3_level', 'd3_x', 'd3_y', 'd3_stuns', 'd3_creeps_stacked', 'd3_camps_stacked', 'd3_rune_pickups', 'd3_firstblood_claimed', 'd3_teamfight_participation', 'd3_towers_killed', 'd3_roshans_killed', 'd3_obs_placed', 'd3_sen_placed', 'd4_hero_id', 'd4_kills', 'd4_deaths', 'd4_assists', 'd4_denies', 'd4_gold', 'd4_lh', 'd4_xp', 'd4_health', 'd4_max_health', 'd4_max_mana', 'd4_level', 'd4_x', 'd4_y', 'd4_stuns', 'd4_creeps_stacked', 'd4_camps_stacked', 'd4_rune_pickups', 'd4_firstblood_claimed', 'd4_teamfight_participation', 'd4_towers_killed', 'd4_roshans_killed', 'd4_obs_placed', 'd4_sen_placed', 'd5_hero_id', 'd5_kills', 'd5_deaths', 'd5_assists', 'd5_denies', 'd5_gold', 'd5_lh', 'd5_xp', 'd5_health', 'd5_max_health', 'd5_max_mana', 'd5_level', 'd5_x', 'd5_y', 'd5_stuns', 'd5_creeps_stacked', 'd5_camps_stacked', 'd5_rune_pickups', 'd5_firstblood_claimed', 'd5_teamfight_participation', 'd5_towers_killed', 'd5_roshans_killed', 'd5_obs_placed', 'd5_sen_placed', 'radiant_tower_kills', 'dire_tower_kills', 'diff_tower_kills', 'd5_ability_upgrades', 'd5_ability_uses', 'd5_purchases', 'd5_item_uses', 'd5_damage_given', 'd5_damage_taken']
# Overlay the per-player correlation masks for all 10 players on one Axes
# (low alpha) to expose a shared correlation pattern across players.
for cmnd in ["r", "d"]:
    for plr in range(1, 6):
        plr_id = f"{cmnd}{plr}"
        plr_ftrs_lst = [ftr for ftr in df_train_features.columns if ftr.startswith(plr_id)]
        sns.heatmap(df_train_features[plr_ftrs_lst].corr() > 0.6, alpha=0.1, cbar=False)
# Spot-check one correlated pair on a sample of rows.
sns.scatterplot(df_train_features[["d5_deaths", "d5_assists"]][1:100], x="d5_deaths", y="d5_assists")
<AxesSubplot: xlabel='d5_deaths', ylabel='d5_assists'>
# Reload the original tables from scratch (fresh-start for feature engineering).
PATH_TO_DATA = "../data"
df_train_features = pd.read_csv(os.path.join(PATH_TO_DATA,
                                             "train_data.csv"),
                                index_col="match_id_hash")
df_train_targets = pd.read_csv(os.path.join(PATH_TO_DATA,
                                            "train_targets.csv"),
                               index_col="match_id_hash")
df_test_features = pd.read_csv(os.path.join(PATH_TO_DATA, "test_data.csv"),
                               index_col="match_id_hash")
# Same CV splitter as before so scores stay comparable.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=SEED)
%%time
# Baseline score on the freshly reloaded data (reference point).
X = df_train_features.values
y = df_train_targets["radiant_win"].values.astype("int8")
rf_model = RandomForestClassifier(n_estimators=300, max_depth=7, n_jobs=-1, random_state=SEED)
cv_scores_rf = cross_val_score(rf_model, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {cv_scores_rf.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.7720210676055513 CPU times: user 7.28 s, sys: 510 ms, total: 7.79 s Wall time: 31.8 s
Ввожу KDA + преобразую first_blood_claim в категориальную переменную для команды, тк он может случиться один раз за игру, 1 для radiant, 0 для dire.
def _add_kda_and_first_blood(df):
    """Replace raw kills/assists/deaths with per-player KDA and collapse the
    ten per-player firstblood flags into one team-level indicator.

    KDA = (kills + assists) / (deaths + 1); the +1 avoids division by zero.
    First blood happens at most once per game, so a single binary column
    suffices: 1 if a Radiant player claimed it, 0 otherwise.

    Returns the transformed frame (the per-player raw columns are dropped).
    """
    drop_cols = []
    for team in ["r", "d"]:
        for player in range(1, 6):
            prefix = f"{team}{player}"
            df[f"{prefix}_kda"] = (
                (df[f"{prefix}_kills"] + df[f"{prefix}_assists"])
                / (df[f"{prefix}_deaths"] + 1)
            )
            drop_cols.append(f"{prefix}_kills")
            drop_cols.append(f"{prefix}_assists")
            drop_cols.append(f"{prefix}_deaths")
    df = df.drop(drop_cols, axis=1)
    first_blood_cols = [f"{team}{player}_firstblood_claimed"
                        for team in ["r", "d"] for player in range(1, 6)]
    radiant_cols = [c for c in first_blood_cols if c.startswith("r")]
    df["first_blood_claimed_radiant"] = df[radiant_cols].sum(axis=1)
    return df.drop(first_blood_cols, axis=1)

# The same transformation was previously duplicated verbatim for the train and
# test tables; apply the single helper to both to keep them in sync.
df_train_features = _add_kda_and_first_blood(df_train_features)
df_test_features = _add_kda_and_first_blood(df_test_features)
%%time
# Re-score after the KDA / first-blood feature engineering.
X = df_train_features.values
y = df_train_targets["radiant_win"].values.astype("int8")
rf_model = RandomForestClassifier(n_estimators=300, max_depth=7, n_jobs=-1, random_state=SEED)
cv_scores_rf = cross_val_score(rf_model, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {cv_scores_rf.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.7776936265657336 CPU times: user 7.66 s, sys: 561 ms, total: 8.22 s Wall time: 25.3 s
# Re-check per-player correlation overlays after the KDA transformation.
for cmnd in ["r", "d"]:
    for plr in range(1, 6):
        plr_id = f"{cmnd}{plr}"
        plr_ftrs_lst = [ftr for ftr in df_train_features.columns if ftr.startswith(plr_id)]
        sns.heatmap(df_train_features[plr_ftrs_lst].corr() > 0.6, alpha=0.1, cbar=False)
# Detailed correlation matrix for the first Radiant player.
r1_ftrs_lst = [ftr for ftr in df_train_features.columns if ftr.startswith("r1")]
sns.heatmap(df_train_features[r1_ftrs_lst].corr(), annot=True, fmt=".2f")
<AxesSubplot: >
# Retrain on the full training set and write a submission with the new features.
rf_model = RandomForestClassifier(n_estimators=300, max_depth=7, n_jobs=-1, random_state=SEED)
rf_model.fit(X, y)
X_test = df_test_features.values
rf_model
y_test_pred = rf_model.predict_proba(X_test)[:, 1]
df_submission = pd.DataFrame({"radiant_win_prob": y_test_pred},
                             index=df_test_features.index)
# Timestamped file name so successive submissions don't overwrite each other.
submission_filename = "submission_{}.csv".format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))
Файл посылки сохранен, как: submission_2023-04-05_23-17-55.csv
Okay, let's try to sum up hero features per team; the same features of different heroes do not correlate, but I will calculate their sums or means to evaluate the team score.
# Base feature names (without the "r1_" prefix) shared by every player slot.
plr_ftrs_lst = [ftr[3:] for ftr in df_train_features.columns if ftr.startswith("r1")]
# One subplot per base feature: the 10x10 cross-player correlation of that feature.
fig, axes = plt.subplots(7, 3, figsize=(15, 35))
axes = axes.ravel()
# `feature` here is the enumerate index into plr_ftrs_lst (one per axis).
for feature, axis in enumerate(axes):
    r_feature = [f"r{plr}_{plr_ftrs_lst[feature]}" for plr in range(1, 6)]
    d_feature = [f"d{plr}_{plr_ftrs_lst[feature]}" for plr in range(1, 6)]
    sns.heatmap(df_train_features[r_feature + d_feature].corr(), ax=axis)
Best Random Forest (actually overfit, but I'll deal with it later, now I apply it only to understand whether my feature transformations are good or not)
%%time
# Deeper, larger forest (depth 19, 700 trees) — prone to overfitting, used
# here only to gauge whether the feature transformations help.
X = df_train_features.values
y = df_train_targets["radiant_win"].values.astype("int8")
rf_model = RandomForestClassifier(n_estimators=700, max_depth=19, n_jobs=-1, random_state=SEED)
cv_scores_rf = cross_val_score(rf_model, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {cv_scores_rf.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.787386574335213 CPU times: user 22.8 s, sys: 1.68 s, total: 24.5 s Wall time: 2min 3s
best model yet
# Fit the best-so-far model on all training data and write a submission.
rf_model = RandomForestClassifier(n_estimators=700, max_depth=19, n_jobs=-1, random_state=SEED)
rf_model.fit(X, y)
X_test = df_test_features.values
rf_model
y_test_pred = rf_model.predict_proba(X_test)[:, 1]
df_submission = pd.DataFrame({"radiant_win_prob": y_test_pred},
                             index=df_test_features.index)
# Timestamped file name so successive submissions don't overwrite each other.
submission_filename = "submission_{}.csv".format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))
Файл посылки сохранен, как: submission_2023-04-06_00-11-19.csv
This will map hero id to a more applicable parameter (the hero's overall win rate).
# Count, for every hero, the total number of games it appeared in (across all
# ten player slots of all training matches).
plrs = []
for tm in ["r", "d"]:
    for plr in range(1, 6):
        plrs.append(f"{tm}{plr}_hero_id")
plrs.append("radiant_win")
heroes_df = pd.concat([df_train_features, df_train_targets], axis=1)[plrs]
plrs.remove("radiant_win")
# NOTE(review): the "index"->"hero_id" rename relies on the pre-2.0 pandas
# value_counts()/reset_index() column naming; newer pandas names the count
# column after the series — verify against the installed version.
df_hero_count = pd.DataFrame(heroes_df.r1_hero_id.value_counts())\
    .reset_index()\
    .rename(columns={"index": "hero_id", "r1_hero_id": "games_total"})
for plr in plrs[1:]:
    intermed_df = pd.DataFrame(heroes_df[plr].value_counts())\
        .reset_index()\
        .rename(columns={"index": "hero_id", str(plr): "games_total"})
    # merge keeps one count column per slot (suffixed); they are summed below
    df_hero_count = pd.merge(df_hero_count, intermed_df, on="hero_id")
# Sum the ten per-slot counts into a single games_total per hero.
df_hero_count["games_total"] = df_hero_count.drop("hero_id", axis=1).sum(axis=1)
df_hero_count = df_hero_count[["hero_id", "games_total"]]
df_hero_count.head()
| hero_id | games_total | |
|---|---|---|
| 0 | 14 | 13207 |
| 1 | 11 | 9187 |
| 2 | 32 | 7628 |
| 3 | 8 | 7270 |
| 4 | 74 | 6392 |
# Hero line-ups of winning teams: Radiant heroes of Radiant wins plus Dire
# heroes of Dire wins, stacked under common column names hero_1..hero_5.
rad_win = heroes_df[heroes_df.radiant_win == True][['r1_hero_id', 'r2_hero_id', 'r3_hero_id', 'r4_hero_id', 'r5_hero_id']]
dire_win = heroes_df[heroes_df.radiant_win == False][['d1_hero_id', 'd2_hero_id', 'd3_hero_id', 'd4_hero_id', 'd5_hero_id']]
rad_win = rad_win.rename(columns={'r1_hero_id': "hero_1", 'r2_hero_id': "hero_2", 'r3_hero_id': "hero_3", 'r4_hero_id': "hero_4", 'r5_hero_id': "hero_5"})
dire_win = dire_win.rename(columns={'d1_hero_id': "hero_1", 'd2_hero_id': "hero_2", 'd3_hero_id': "hero_3", 'd4_hero_id': "hero_4", 'd5_hero_id': "hero_5"})
hero_win_df = pd.concat([rad_win, dire_win], axis=0)
hero_win_df.head()
| hero_1 | hero_2 | hero_3 | hero_4 | hero_5 | |
|---|---|---|---|---|---|
| match_id_hash | |||||
| b9c57c450ce74a2af79c9ce96fac144d | 15 | 96 | 27 | 63 | 89 |
| 6db558535151ea18ca70a6892197db41 | 101 | 51 | 44 | 49 | 53 |
| c96d629dc0c39f0c616d1949938a6ba6 | 119 | 71 | 44 | 35 | 108 |
| a3c3892648b873bb869dd81ca0f62286 | 8 | 91 | 32 | 12 | 69 |
| 5feece770ca79e5e8cd8052198b3f533 | 103 | 86 | 23 | 29 | 8 |
# Count, per hero, the number of games it was on the winning side
# (same per-slot count-then-sum pattern as games_total above).
# NOTE(review): relies on pre-2.0 pandas value_counts()/reset_index() naming.
df_win_count = pd.DataFrame(hero_win_df.hero_1.value_counts())\
    .reset_index()\
    .rename(columns={"index": "hero_id", "hero_1": "wins"})
for plr in range(2, 6):
    intermed_df = pd.DataFrame(hero_win_df[f"hero_{plr}"].value_counts())\
        .reset_index()\
        .rename(columns={"index": "hero_id", f"hero_{plr}": "wins"})
    df_win_count = pd.merge(df_win_count, intermed_df, on="hero_id")
df_win_count["win_count"] = df_win_count.drop("hero_id", axis=1).sum(axis=1)
df_win_count = df_win_count[["hero_id", "win_count"]]
df_win_count.head()
| hero_id | win_count | |
|---|---|---|
| 0 | 14 | 6659 |
| 1 | 11 | 4352 |
| 2 | 32 | 4415 |
| 3 | 8 | 3788 |
| 4 | 35 | 2857 |
# Per-hero empirical win rate = wins / total games; kept as a lookup dict.
df_games_wins = pd.merge(df_hero_count, df_win_count, on="hero_id")
df_games_wins["win_prop"] = df_games_wins.win_count / df_games_wins.games_total
df_games_wins = df_games_wins.sort_values("hero_id")
win_proportion_dict = dict(zip(df_games_wins.hero_id, df_games_wins.win_prop))
df_games_wins.head()
| hero_id | games_total | win_count | win_prop | |
|---|---|---|---|---|
| 46 | 1 | 3020 | 1428 | 0.472848 |
| 30 | 2 | 3187 | 1601 | 0.502353 |
| 76 | 3 | 1754 | 896 | 0.510832 |
| 56 | 4 | 2290 | 1166 | 0.509170 |
| 8 | 5 | 5272 | 2734 | 0.518589 |
# Replace each *_hero_id column with the hero's train-set win rate.
# NOTE(review): the lambda raises KeyError for any hero id present in the test
# set but absent from the train set — `.map` with a fallback would be safer;
# confirm all test ids are covered.
for plr in plrs:
    df_train_features[plr] = df_train_features[plr].apply(lambda x: win_proportion_dict[x])
    df_test_features[plr] = df_test_features[plr].apply(lambda x: win_proportion_dict[x])
df_train_features.head()
| game_time | game_mode | lobby_type | objectives_len | chat_len | r1_hero_id | r1_denies | r1_gold | r1_lh | r1_xp | ... | r2_kda | r3_kda | r4_kda | r5_kda | d1_kda | d2_kda | d3_kda | d4_kda | d5_kda | first_blood_claimed_radiant | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| match_id_hash | |||||||||||||||||||||
| b9c57c450ce74a2af79c9ce96fac144d | 658 | 4 | 0 | 3 | 10 | 0.479358 | 7 | 5257 | 52 | 3937 | ... | 2.500000 | 2.5 | 7.0 | 6.000000 | 0.333333 | 0.142857 | 0.500000 | 0.5 | 0.000000 | 0 |
| 6db558535151ea18ca70a6892197db41 | 21 | 23 | 0 | 0 | 0 | 0.522029 | 0 | 176 | 0 | 0 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0 |
| 19c39fe2af2b547e48708ca005c6ae74 | 160 | 22 | 7 | 0 | 0 | 0.532962 | 1 | 403 | 0 | 359 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0 |
| c96d629dc0c39f0c616d1949938a6ba6 | 1016 | 22 | 0 | 1 | 0 | 0.503966 | 5 | 3085 | 1 | 2828 | ... | 1.333333 | 1.0 | 2.0 | 1.333333 | 0.800000 | 4.000000 | 2.333333 | 1.5 | 1.500000 | 1 |
| 156c88bff4e9c4668b0f53df3d870f1b | 582 | 22 | 7 | 2 | 2 | 0.505334 | 9 | 3516 | 40 | 3964 | ... | 0.428571 | 2.0 | 0.5 | 0.000000 | 3.000000 | 0.666667 | 5.000000 | 1.0 | 1.666667 | 0 |
5 rows × 216 columns
df_test_features.head()
| game_time | game_mode | lobby_type | objectives_len | chat_len | r1_hero_id | r1_denies | r1_gold | r1_lh | r1_xp | ... | r2_kda | r3_kda | r4_kda | r5_kda | d1_kda | d2_kda | d3_kda | d4_kda | d5_kda | first_blood_claimed_radiant | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| match_id_hash | |||||||||||||||||||||
| a400b8f29dece5f4d266f49f1ae2e98a | 155 | 22 | 7 | 1 | 11 | 0.473713 | 0 | 543 | 7 | 533 | ... | 0.0 | 0.000000 | 0.0 | 0.000000 | 1.000000 | 0.00 | 0.0 | 1.000000 | 0.000000 | 0 |
| 46a0ddce8f7ed2a8d9bd5edcbb925682 | 576 | 22 | 7 | 1 | 4 | 0.504202 | 1 | 1613 | 0 | 1471 | ... | 2.0 | 2.000000 | 1.5 | 1.000000 | 0.000000 | 0.25 | 0.5 | 0.000000 | 0.000000 | 1 |
| b1b35ff97723d9b7ade1c9c3cf48f770 | 453 | 22 | 7 | 1 | 3 | 0.545796 | 0 | 1404 | 9 | 1351 | ... | 1.0 | 0.000000 | 2.0 | 0.333333 | 1.000000 | 1.00 | 1.0 | 1.000000 | 0.500000 | 1 |
| ab3cc6ccac661a1385e73a2e9f21313a | 721 | 4 | 0 | 2 | 1 | 0.504766 | 3 | 2306 | 6 | 3088 | ... | 1.0 | 0.333333 | 1.5 | 1.000000 | 1.166667 | 2.00 | 2.0 | 5.000000 | 0.333333 | 0 |
| 54aaab1cb8cc5df3c253641618673266 | 752 | 22 | 7 | 1 | 0 | 0.521045 | 8 | 3917 | 46 | 5081 | ... | 1.0 | 1.500000 | 2.5 | 1.000000 | 0.000000 | 0.50 | 0.0 | 0.666667 | 0.750000 | 1 |
5 rows × 216 columns
Originally Ancients are placed at the coordinates (284, 5200) and (5200, 284), but here we have other min and max x and y values. I will suppose that (min(x), min(y)) and (max(x), max(y)) are the coordinates of Ancients.
Ancient radiant: (66, 70); Ancient dire: (188, 186).
Euclidean distance is used.
df_train_features
| game_time | game_mode | lobby_type | objectives_len | chat_len | r1_hero_id | r1_denies | r1_gold | r1_lh | r1_xp | ... | r2_kda | r3_kda | r4_kda | r5_kda | d1_kda | d2_kda | d3_kda | d4_kda | d5_kda | first_blood_claimed_radiant | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| match_id_hash | |||||||||||||||||||||
| b9c57c450ce74a2af79c9ce96fac144d | 658 | 4 | 0 | 3 | 10 | 0.479358 | 7 | 5257 | 52 | 3937 | ... | 2.500000 | 2.500000 | 7.000000 | 6.000000 | 0.333333 | 0.142857 | 0.500000 | 0.500000 | 0.000000 | 0 |
| 6db558535151ea18ca70a6892197db41 | 21 | 23 | 0 | 0 | 0 | 0.522029 | 0 | 176 | 0 | 0 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 |
| 19c39fe2af2b547e48708ca005c6ae74 | 160 | 22 | 7 | 0 | 0 | 0.532962 | 1 | 403 | 0 | 359 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 |
| c96d629dc0c39f0c616d1949938a6ba6 | 1016 | 22 | 0 | 1 | 0 | 0.503966 | 5 | 3085 | 1 | 2828 | ... | 1.333333 | 1.000000 | 2.000000 | 1.333333 | 0.800000 | 4.000000 | 2.333333 | 1.500000 | 1.500000 | 1 |
| 156c88bff4e9c4668b0f53df3d870f1b | 582 | 22 | 7 | 2 | 2 | 0.505334 | 9 | 3516 | 40 | 3964 | ... | 0.428571 | 2.000000 | 0.500000 | 0.000000 | 3.000000 | 0.666667 | 5.000000 | 1.000000 | 1.666667 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| b794aa55646841a03c7783226e6f6bc8 | 2666 | 22 | 0 | 23 | 8 | 0.530687 | 3 | 19850 | 251 | 25448 | ... | 1.818182 | 1.666667 | 2.857143 | 2.428571 | 1.352941 | 3.444444 | 13.000000 | 4.666667 | 3.200000 | 0 |
| 308faee28efee2e66b39f9f2ba6ea9cf | 2525 | 22 | 0 | 15 | 5 | 0.572719 | 3 | 13914 | 151 | 19592 | ... | 1.571429 | 5.000000 | 15.000000 | 2.500000 | 1.750000 | 1.142857 | 1.200000 | 0.250000 | 0.625000 | 0 |
| 6066cc7417b43c749d551e123d00f0c8 | 1002 | 4 | 0 | 4 | 0 | 0.472848 | 0 | 4613 | 59 | 4478 | ... | 0.600000 | 0.428571 | 1.000000 | 1.250000 | 8.000000 | 5.000000 | 3.333333 | 2.400000 | 5.000000 | 0 |
| bc7a87ed5f9c2bca55f9f7a93da0b0c5 | 377 | 22 | 7 | 1 | 0 | 0.504202 | 1 | 809 | 2 | 1102 | ... | 0.500000 | 1.000000 | 2.000000 | 0.000000 | 0.500000 | 1.000000 | 0.500000 | 0.000000 | 0.000000 | 1 |
| e2ca68ac1a6847f4a37f6c9c8ee8695b | 643 | 22 | 7 | 1 | 23 | 0.503371 | 2 | 1747 | 4 | 2677 | ... | 0.500000 | 0.090909 | 1.000000 | 0.000000 | 4.000000 | 1.000000 | 2.500000 | 6.000000 | 10.000000 | 0 |
31698 rows × 216 columns
# Column names of the ten player x/y coordinates (r1..r5, d1..d5).
plrs_x = [f"{tm}{plr}_x" for tm in ("r", "d") for plr in range(1, 6)]
plrs_y = [f"{tm}{plr}_y" for tm in ("r", "d") for plr in range(1, 6)]
# Global extremes over all players approximate the two Ancients' positions.
x_min = df_train_features[plrs_x].min().min()
y_min = df_train_features[plrs_y].min().min()
x_max = df_train_features[plrs_x].max().max()
y_max = df_train_features[plrs_y].max().max()
print(f"Coordinates of the Ancient: ({x_min}, {y_min}); ({x_max}, {y_max}).")
Coordinates of the Ancient: (66, 70); (188, 186).
# The Radiant Ancient sits at the minimal corner of the observed map coords.
ancient_coord = (x_min, y_min)
# Player id prefixes: r1..r5 and d1..d5.
plrs = [f"{tm}{plr}" for tm in ("r", "d") for plr in range(1, 6)]
def dists(x1, y1, x2, y2):
    """Element-wise Euclidean distance between points (x1, y1) and (x2, y2)."""
    dx = x2 - x1
    dy = y2 - y1
    return np.sqrt(np.square(dx) + np.square(dy))
def distance_to_ancient(df, x2, y2, players=None):
    """Add `<player>_dist_to_anc` columns (Euclidean distance from each
    player's position to the Ancient at (x2, y2)) and drop the raw
    `<player>_x` / `<player>_y` coordinate columns.

    Note: the new distance columns are added to *df* in place; the frame
    returned is a copy without the coordinate columns.

    players: list of player prefixes (e.g. ["r1", ..., "d5"]). Defaults to
    the module-level `plrs` list so existing call sites keep working;
    passing it explicitly makes the function self-contained.
    """
    if players is None:
        players = plrs  # module-level list of player prefixes
    for pid in players:
        dx = df[f"{pid}_x"] - x2
        dy = df[f"{pid}_y"] - y2
        df[f"{pid}_dist_to_anc"] = np.sqrt(np.square(dx) + np.square(dy))
    coord_cols = [f"{p}_x" for p in players] + [f"{p}_y" for p in players]
    return df.drop(coord_cols, axis=1)
df_train_features = distance_to_ancient(df_train_features, ancient_coord[0], ancient_coord[1])
df_test_features = distance_to_ancient(df_test_features, ancient_coord[0], ancient_coord[1])
def mean_ftrs_by_plr(df, ftr):
    """Replace the ten per-player `<team><n>_<ftr>` columns with two
    team-level means, `r_<ftr>` and `d_<ftr>`, and return the reduced frame."""
    consumed = []
    for team in ("r", "d"):
        cols = [f"{team}{n}_{ftr}" for n in range(1, 6)]
        df[f"{team}_{ftr}"] = df[cols].mean(axis=1)
        consumed.extend(cols)
    return df.drop(consumed, axis=1)
def sum_up_ftrs_by_plr(df, ftr):
    """Replace the ten per-player `<team><n>_<ftr>` columns with two
    team-level sums, `r_<ftr>` and `d_<ftr>`, and return the reduced frame."""
    consumed = []
    for team in ("r", "d"):
        cols = [f"{team}{n}_{ftr}" for n in range(1, 6)]
        df[f"{team}_{ftr}"] = df[cols].sum(axis=1)
        consumed.extend(cols)
    return df.drop(consumed, axis=1)
Hero ids will be summed up as well, because they now hold hero win rates and therefore describe the team as a whole.
# Every per-player feature name (the suffix after "r1_"), inferred from the
# r1-prefixed columns; each gets collapsed into one value per team.
plr_ftrs_lst = [ftr[3:] for ftr in df_train_features.columns if ftr.startswith("r1")]
for feature in plr_ftrs_lst:
    df_train_features = sum_up_ftrs_by_plr(df=df_train_features, ftr=feature)
    df_test_features = sum_up_ftrs_by_plr(df=df_test_features, ftr=feature)
%%time
# Random Forest CV on the team-aggregated features.
# NOTE(review): `cv` is not defined in this fragment — presumably a
# ShuffleSplit created in an earlier cell; confirm before re-running.
X = df_train_features.values
y = df_train_targets["radiant_win"].values.astype("int8")
rf_model = RandomForestClassifier(n_estimators=700, max_depth=19, n_jobs=-1, random_state=SEED)
cv_scores_rf = cross_val_score(rf_model, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {cv_scores_rf.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.819087237079156 CPU times: user 15.4 s, sys: 1.05 s, total: 16.5 s Wall time: 1min 10s
Gold proportion to evaluate gold disbalance between two teams
# Radiant's share of total gold — one disbalance feature replacing the two
# absolute team gold sums.
# NOTE(review): unlike the extended-dataset version below, the denominator
# has no +1 guard, so a row where both teams have zero gold would yield
# NaN — confirm such rows cannot occur.
df_train_features["r_gold_prop"] = df_train_features.r_gold / (df_train_features.r_gold + df_train_features.d_gold)
df_test_features["r_gold_prop"] = df_test_features.r_gold / (df_test_features.r_gold + df_test_features.d_gold)
df_train_features = df_train_features.drop(["r_gold", "d_gold"], axis=1)
df_test_features = df_test_features.drop(["r_gold", "d_gold"], axis=1)
Lots of features correlate with game time
sns.heatmap(df_train_features.corr() > 0.6)
;
''
Shall we normalize lh, xp, health, max_health, max_mana, level, stuns, rune_pickups, towers_killed, obs_placed and sen_placed per game time? YES (it ameliorated the model)
# Features that grow with game duration; normalize them per second of game
# time (+1 avoids division by zero when game_time == 0) and drop the raw
# columns to reduce correlation with game_time.
per_time_norm_list = ['r_lh', 'd_lh',
                      'r_xp', 'd_xp', 'r_health', 'd_health', 'r_max_health', 'd_max_health',
                      'r_max_mana', 'd_max_mana', 'r_level', 'd_level', 'r_stuns', 'd_stuns',
                      'r_rune_pickups', 'd_rune_pickups', 'r_towers_killed', 'd_towers_killed',
                      'r_obs_placed', 'd_obs_placed', 'r_sen_placed', 'd_sen_placed']
for ftr in per_time_norm_list:
    df_train_features[f"{ftr}_per_time"] = df_train_features[f"{ftr}"] / (df_train_features["game_time"] + 1)
    df_test_features[f"{ftr}_per_time"] = df_test_features[f"{ftr}"] / (df_test_features["game_time"] + 1)
    df_train_features = df_train_features.drop(f"{ftr}", axis=1)
    df_test_features = df_test_features.drop(f"{ftr}", axis=1)
sns.heatmap(df_train_features.drop(drop_list, axis=1).corr() > 0.8)
;
''
Shall I count lh per time, xp per time, health per time, max health per time, max mana per time, level per time, stuns per time, rune pickups per time, obs placed per time, sen placed per time as proportions as well and hence evaluate the disbalance between teams? YES (it ameliorated the model)
# Per-time features that exist for both teams, reduced below to Radiant's
# share of the team total.
disbalance = ['lh_per_time',
              'xp_per_time',
              'health_per_time',
              'max_health_per_time',
              'max_mana_per_time',
              'level_per_time',
              'stuns_per_time',
              'rune_pickups_per_time',
              'obs_placed_per_time',
              'sen_placed_per_time']
# r / (r + d + 1): the +1 prevents 0/0 but biases shares slightly toward 0.
for ftr in disbalance:
    df_train_features[f"r_{ftr}_prop"] = df_train_features[f"r_{ftr}"] / (df_train_features[f"r_{ftr}"] + df_train_features[f"d_{ftr}"] + 1)
    df_test_features[f"r_{ftr}_prop"] = df_test_features[f"r_{ftr}"] / (df_test_features[f"r_{ftr}"] + df_test_features[f"d_{ftr}"] + 1)
    df_train_features = df_train_features.drop([f"r_{ftr}", f"d_{ftr}"], axis=1)
    df_test_features = df_test_features.drop([f"r_{ftr}", f"d_{ftr}"], axis=1)
sns.heatmap(df_train_features.drop(drop_list, axis=1).corr() > 0.6)
;
''
sns.heatmap(df_train_features.corr())
<AxesSubplot: >
Upsampling + standardization
df_train_features.head()
| game_time | game_mode | lobby_type | objectives_len | chat_len | first_blood_claimed_radiant | r_hero_id | d_hero_id | r_denies | d_denies | ... | r_lh_per_time_prop | r_xp_per_time_prop | r_health_per_time_prop | r_max_health_per_time_prop | r_max_mana_per_time_prop | r_level_per_time_prop | r_stuns_per_time_prop | r_rune_pickups_per_time_prop | r_obs_placed_per_time_prop | r_sen_placed_per_time_prop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| match_id_hash | |||||||||||||||||||||
| b9c57c450ce74a2af79c9ce96fac144d | 658 | 4 | 0 | 3 | 10 | 0 | 2.531378 | 2.589738 | 28 | 16 | ... | 0.143991 | 0.563917 | 0.531239 | 0.501016 | 0.455913 | 0.050964 | 0.059694 | 0.018950 | 0.005988 | 0.004518 |
| 6db558535151ea18ca70a6892197db41 | 21 | 23 | 0 | 0 | 0 | 0 | 2.504680 | 2.587266 | 0 | 0 | ... | 0.000000 | 0.000000 | 0.496823 | 0.496699 | 0.509453 | 0.156250 | 0.000000 | 0.115385 | 0.000000 | 0.000000 |
| 19c39fe2af2b547e48708ca005c6ae74 | 160 | 22 | 7 | 0 | 0 | 0 | 2.492869 | 2.624720 | 9 | 5 | ... | 0.090909 | 0.523860 | 0.580584 | 0.504202 | 0.479748 | 0.065217 | 0.042986 | 0.041667 | 0.006173 | 0.000000 |
| c96d629dc0c39f0c616d1949938a6ba6 | 1016 | 22 | 0 | 1 | 0 | 1 | 2.560871 | 2.397790 | 33 | 38 | ... | 0.153901 | 0.479421 | 0.455123 | 0.458762 | 0.434607 | 0.043088 | 0.055981 | 0.030160 | 0.005825 | 0.001961 |
| 156c88bff4e9c4668b0f53df3d870f1b | 582 | 22 | 7 | 2 | 2 | 0 | 2.393454 | 2.600217 | 37 | 44 | ... | 0.168862 | 0.479118 | 0.474057 | 0.489113 | 0.451197 | 0.045242 | 0.008406 | 0.014901 | 0.003396 | 0.000000 |
5 rows × 35 columns
df_train_features.columns
Index(['game_time', 'game_mode', 'lobby_type', 'objectives_len', 'chat_len',
'first_blood_claimed_radiant', 'r_hero_id', 'd_hero_id', 'r_denies',
'd_denies', 'r_creeps_stacked', 'd_creeps_stacked', 'r_camps_stacked',
'd_camps_stacked', 'r_teamfight_participation',
'd_teamfight_participation', 'r_roshans_killed', 'd_roshans_killed',
'r_kda', 'd_kda', 'r_dist_to_anc', 'd_dist_to_anc', 'r_gold_prop',
'r_towers_killed_per_time', 'd_towers_killed_per_time',
'r_lh_per_time_prop', 'r_xp_per_time_prop', 'r_health_per_time_prop',
'r_max_health_per_time_prop', 'r_max_mana_per_time_prop',
'r_level_per_time_prop', 'r_stuns_per_time_prop',
'r_rune_pickups_per_time_prop', 'r_obs_placed_per_time_prop',
'r_sen_placed_per_time_prop'],
dtype='object')
# Scale every numeric column; one-hot encode the single binary flag.
cat_cols = ['first_blood_claimed_radiant']
num_cols = [col for col in df_train_features.columns if col not in cat_cols]
preprocessor = ColumnTransformer(
    transformers=[("scaler", StandardScaler(), num_cols),
                  ("ohe", OneHotEncoder(drop="first"), cat_cols)]
)
df_train_targets.radiant_win.value_counts()
True 16670 False 15028 Name: radiant_win, dtype: int64
# Keep the test-set index for the submission file.
indexes = df_test_features.index
# Join features with targets (targets' duplicate game_time dropped) so rows
# and labels stay aligned through the resampling below.
df_train = pd.concat([df_train_features, df_train_targets.drop("game_time", axis=1)], axis=1)
rad_win = df_train[df_train.radiant_win == True]
dir_win = df_train[df_train.radiant_win == False]
reshape_val = rad_win.shape[0]
# Upsample the minority class (Dire wins) with replacement up to 50/50.
# NOTE(review): resampling BEFORE cross-validation puts duplicated rows in
# both train and validation folds, so CV scores on the balanced set are
# optimistically biased.
dir_win_up = resample(dir_win, random_state=SEED, n_samples=reshape_val, replace=True)
df_train_balanced = pd.concat([rad_win, dir_win_up])
df_train_balanced.radiant_win.value_counts()
True 16670 False 16670 Name: radiant_win, dtype: int64
# Drop the label and the post-game targets (known only after the match,
# i.e. outcome leakage); radiant_win becomes the binary label.
X, y = df_train_balanced.drop(['radiant_win', 'duration', 'time_remaining',
                               'next_roshan_team'], axis=1), df_train_balanced["radiant_win"].values.astype("int8")
preprocessor.fit(X)
df_train_features_tr = preprocessor.transform(X)
df_test_features_tr = preprocessor.transform(df_test_features)
X.shape, len(y)
((33340, 35), 33340)
df_test_features.columns
Index(['game_time', 'game_mode', 'lobby_type', 'objectives_len', 'chat_len',
'first_blood_claimed_radiant', 'r_hero_id', 'd_hero_id', 'r_denies',
'd_denies', 'r_creeps_stacked', 'd_creeps_stacked', 'r_camps_stacked',
'd_camps_stacked', 'r_teamfight_participation',
'd_teamfight_participation', 'r_roshans_killed', 'd_roshans_killed',
'r_kda', 'd_kda', 'r_dist_to_anc', 'd_dist_to_anc', 'r_gold_prop',
'r_towers_killed_per_time', 'd_towers_killed_per_time',
'r_lh_per_time_prop', 'r_xp_per_time_prop', 'r_health_per_time_prop',
'r_max_health_per_time_prop', 'r_max_mana_per_time_prop',
'r_level_per_time_prop', 'r_stuns_per_time_prop',
'r_rune_pickups_per_time_prop', 'r_obs_placed_per_time_prop',
'r_sen_placed_per_time_prop'],
dtype='object')
Now let's try Random Forest. It is not bad so far.
%%time
# hero ids as win proportion and summed up
# RF on the scaled, upsampled training matrix.
# NOTE(review): score inflated by resampling before CV (duplicate rows
# shared between folds); trust the held-out Kaggle score instead.
X = df_train_features_tr
y = y
rf_model = RandomForestClassifier(n_estimators=700, max_depth=7, n_jobs=-1, random_state=SEED)
cv_scores_rf = cross_val_score(rf_model, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {cv_scores_rf.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.8400976564665406 CPU times: user 8.05 s, sys: 764 ms, total: 8.81 s Wall time: 39.9 s
# hero ids as win proportion and summed up
# Fit the tuned RF on the full balanced training set and predict on test.
rf_model = RandomForestClassifier(n_estimators=700, max_depth=7, n_jobs=-1, random_state=SEED)
rf_model.fit(X, y)
X_test = df_test_features_tr
rf_model
# Submission: probability that Radiant wins, indexed by match hash.
y_test_pred = rf_model.predict_proba(X_test)[:, 1]
df_submission = pd.DataFrame({"radiant_win_prob": y_test_pred},
                             index=indexes)
submission_filename = "submission_{}.csv".format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))
Файл посылки сохранен, как: submission_2023-04-08_17-48-19.csv
# Feature importances of the fitted RF.
# Fix: the model was trained on the ColumnTransformer output, whose column
# order is the scaled numeric columns FIRST, then the one-hot flag — not the
# raw df_test_features column order. Zipping with df_test_features.columns
# mislabels the bars; num_cols + cat_cols matches the transformed order
# (the binary flag with drop="first" yields exactly one OHE column).
feature_names = num_cols + cat_cols
ftr_imps = pd.DataFrame(list(zip(feature_names, rf_model.feature_importances_)), columns=["feature", "importance"])
sns.barplot(data=ftr_imps.sort_values("importance", ascending=False),
            x="importance", y="feature",
            linewidth=1.5, edgecolor="teal", facecolor='teal', alpha=0.5)
;
''
Will these data transformations work on the same model better on extended DF?
df_train_features_extended.head()
| game_time | game_mode | lobby_type | objectives_len | chat_len | r1_hero_id | r1_kills | r1_deaths | r1_assists | r1_denies | ... | d4_nearby_creep_death_count | d4_pred_vict | d5_ability_upgrades | d5_ability_uses | d5_purchases | d5_item_uses | d5_damage_given | d5_damage_taken | d5_nearby_creep_death_count | d5_pred_vict | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| match_id_hash | |||||||||||||||||||||
| b9c57c450ce74a2af79c9ce96fac144d | 658 | 4 | 0 | 3 | 10 | 15 | 7 | 2 | 0 | 7 | ... | 55.0 | False | 4.0 | 7.0 | 12.0 | 9.0 | 2308.0 | 2154.0 | 27.0 | False |
| 6db558535151ea18ca70a6892197db41 | 21 | 23 | 0 | 0 | 0 | 101 | 0 | 0 | 0 | 0 | ... | 0.0 | False | 0.0 | 0.0 | 4.0 | 1.0 | 0.0 | 0.0 | 0.0 | False |
| 19c39fe2af2b547e48708ca005c6ae74 | 160 | 22 | 7 | 0 | 0 | 57 | 0 | 0 | 0 | 1 | ... | 11.0 | False | 0.0 | 2.0 | 9.0 | 2.0 | 4496.0 | 948.0 | 9.0 | False |
| c96d629dc0c39f0c616d1949938a6ba6 | 1016 | 22 | 0 | 1 | 0 | 119 | 0 | 3 | 3 | 5 | ... | 133.0 | False | 5.0 | 32.0 | 21.0 | 14.0 | 4682.0 | 1608.0 | 108.0 | False |
| 156c88bff4e9c4668b0f53df3d870f1b | 582 | 22 | 7 | 2 | 2 | 12 | 3 | 1 | 2 | 9 | ... | 75.0 | False | 2.0 | 19.0 | 14.0 | 8.0 | 2156.0 | 1593.0 | 32.0 | False |
5 rows × 328 columns
df_train_targets.head()
| game_time | radiant_win | duration | time_remaining | next_roshan_team | |
|---|---|---|---|---|---|
| match_id_hash | |||||
| b9c57c450ce74a2af79c9ce96fac144d | 658 | True | 1154 | 496 | NaN |
| 6db558535151ea18ca70a6892197db41 | 21 | True | 1503 | 1482 | Radiant |
| 19c39fe2af2b547e48708ca005c6ae74 | 160 | False | 2063 | 1903 | NaN |
| c96d629dc0c39f0c616d1949938a6ba6 | 1016 | True | 2147 | 1131 | Radiant |
| 156c88bff4e9c4668b0f53df3d870f1b | 582 | False | 1927 | 1345 | Dire |
df_test_features_extended.head()
| game_time | game_mode | lobby_type | objectives_len | chat_len | r1_hero_id | r1_kills | r1_deaths | r1_assists | r1_denies | ... | d4_nearby_creep_death_count | d4_pred_vict | d5_ability_upgrades | d5_ability_uses | d5_purchases | d5_item_uses | d5_damage_given | d5_damage_taken | d5_nearby_creep_death_count | d5_pred_vict | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| match_id_hash | |||||||||||||||||||||
| a400b8f29dece5f4d266f49f1ae2e98a | 155 | 22 | 7 | 1 | 11 | 11 | 0 | 0 | 0 | 0 | ... | 20.0 | False | 0.0 | 4.0 | 6.0 | 3.0 | 2332.0 | 681.0 | 17.0 | False |
| 46a0ddce8f7ed2a8d9bd5edcbb925682 | 576 | 22 | 7 | 1 | 4 | 14 | 1 | 0 | 3 | 1 | ... | 33.0 | True | 3.0 | 19.0 | 13.0 | 11.0 | 3955.0 | 3317.0 | 75.0 | False |
| b1b35ff97723d9b7ade1c9c3cf48f770 | 453 | 22 | 7 | 1 | 3 | 42 | 0 | 1 | 1 | 0 | ... | 43.0 | False | 2.0 | 9.0 | 7.0 | 6.0 | 10739.0 | 2785.0 | 65.0 | False |
| ab3cc6ccac661a1385e73a2e9f21313a | 721 | 4 | 0 | 2 | 1 | 30 | 2 | 2 | 1 | 3 | ... | 62.0 | False | 7.0 | 7.0 | 14.0 | 10.0 | 15255.0 | 5052.0 | 99.0 | False |
| 54aaab1cb8cc5df3c253641618673266 | 752 | 22 | 7 | 1 | 0 | 8 | 2 | 0 | 2 | 8 | ... | 95.0 | False | 5.0 | 64.0 | 23.0 | 11.0 | 16086.0 | 5782.0 | 84.0 | False |
5 rows × 328 columns
%%time
# base model
# Baseline RF on the raw extended features, before any aggregation.
X = df_train_features_extended.values
y = df_train_targets["radiant_win"].values.astype("int8")
rf_model = RandomForestClassifier(n_estimators=300, max_depth=7, n_jobs=-1, random_state=SEED)
cv_scores_rf = cross_val_score(rf_model, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {cv_scores_rf.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.7787220332559414 CPU times: user 22.6 s, sys: 1.45 s, total: 24 s Wall time: 48.8 s
Now feature transformation as I did with the original data
KDA
# KDA = (kills + assists) / (deaths + 1); the +1 avoids division by zero.
for side in ["r", "d"]:
    for num in range(1, 6):
        pid = f"{side}{num}"
        for frame in (df_train_features_extended, df_test_features_extended):
            frame[f"{pid}_kda"] = (frame[f"{pid}_kills"] + frame[f"{pid}_assists"]) / (frame[f"{pid}_deaths"] + 1)
# The raw kill/assist/death columns are now folded into KDA — drop them.
drop_list = [f"{side}{num}_{stat}"
             for side in ["r", "d"]
             for num in range(1, 6)
             for stat in ("kills", "assists", "deaths")]
df_train_features_extended = df_train_features_extended.drop(drop_list, axis=1)
df_test_features_extended = df_test_features_extended.drop(drop_list, axis=1)
# First blood is claimed at most once per match, so the sum over Radiant
# players is effectively a 0/1 flag "Radiant claimed first blood"; the Dire
# columns carry the complementary information and are dropped.
rad_first_blood_features_list = ["r1_firstblood_claimed", "r2_firstblood_claimed", "r3_firstblood_claimed", "r4_firstblood_claimed", "r5_firstblood_claimed"]
dire_first_blood_features_list = ["d1_firstblood_claimed", "d2_firstblood_claimed", "d3_firstblood_claimed", "d4_firstblood_claimed", "d5_firstblood_claimed"]
df_train_features_extended["first_blood_claimed_radiant"] = df_train_features_extended[rad_first_blood_features_list].sum(axis=1)
df_train_features_extended = df_train_features_extended.drop(rad_first_blood_features_list + dire_first_blood_features_list, axis=1)
df_test_features_extended["first_blood_claimed_radiant"] = df_test_features_extended[rad_first_blood_features_list].sum(axis=1)
df_test_features_extended = df_test_features_extended.drop(rad_first_blood_features_list + dire_first_blood_features_list, axis=1)
%%time
# kda and first blood claim by team
# Same baseline RF; checks whether the KDA/first-blood aggregation helped.
X = df_train_features_extended.values
y = df_train_targets["radiant_win"].values.astype("int8")
rf_model = RandomForestClassifier(n_estimators=300, max_depth=7, n_jobs=-1, random_state=SEED)
cv_scores_rf = cross_val_score(rf_model, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {cv_scores_rf.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.7852818518586497 CPU times: user 8.13 s, sys: 524 ms, total: 8.66 s Wall time: 38.3 s
# Correlation structure within each player's feature group; all ten boolean
# heatmaps (|corr| > 0.6) are drawn onto one axes, alpha-blended.
for cmnd in ["r", "d"]:
    for plr in range(1, 6):
        plr_id = f"{cmnd}{plr}"
        # NOTE(review): startswith("r1") also matches e.g. "r1..."-prefixed
        # columns only — fine here since all extended columns use this scheme.
        plr_ftrs_lst = [ftr for ftr in df_train_features_extended.columns if ftr.startswith(plr_id)]
        sns.heatmap(df_train_features_extended[plr_ftrs_lst].corr() > 0.6, alpha=0.1, cbar=False)
Hero ID as success rate
df_train_features_extended
| game_time | game_mode | lobby_type | objectives_len | chat_len | r1_hero_id | r1_denies | r1_gold | r1_lh | r1_xp | ... | r2_kda | r3_kda | r4_kda | r5_kda | d1_kda | d2_kda | d3_kda | d4_kda | d5_kda | first_blood_claimed_radiant | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| match_id_hash | |||||||||||||||||||||
| b9c57c450ce74a2af79c9ce96fac144d | 658 | 4 | 0 | 3 | 10 | 15 | 7 | 5257 | 52 | 3937 | ... | 2.500000 | 2.500000 | 7.000000 | 6.000000 | 0.333333 | 0.142857 | 0.500000 | 0.500000 | 0.000000 | 0 |
| 6db558535151ea18ca70a6892197db41 | 21 | 23 | 0 | 0 | 0 | 101 | 0 | 176 | 0 | 0 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 |
| 19c39fe2af2b547e48708ca005c6ae74 | 160 | 22 | 7 | 0 | 0 | 57 | 1 | 403 | 0 | 359 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 |
| c96d629dc0c39f0c616d1949938a6ba6 | 1016 | 22 | 0 | 1 | 0 | 119 | 5 | 3085 | 1 | 2828 | ... | 1.333333 | 1.000000 | 2.000000 | 1.333333 | 0.800000 | 4.000000 | 2.333333 | 1.500000 | 1.500000 | 1 |
| 156c88bff4e9c4668b0f53df3d870f1b | 582 | 22 | 7 | 2 | 2 | 12 | 9 | 3516 | 40 | 3964 | ... | 0.428571 | 2.000000 | 0.500000 | 0.000000 | 3.000000 | 0.666667 | 5.000000 | 1.000000 | 1.666667 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| b794aa55646841a03c7783226e6f6bc8 | 2666 | 22 | 0 | 23 | 8 | 99 | 3 | 19850 | 251 | 25448 | ... | 1.818182 | 1.666667 | 2.857143 | 2.428571 | 1.352941 | 3.444444 | 13.000000 | 4.666667 | 3.200000 | 0 |
| 308faee28efee2e66b39f9f2ba6ea9cf | 2525 | 22 | 0 | 15 | 5 | 22 | 3 | 13914 | 151 | 19592 | ... | 1.571429 | 5.000000 | 15.000000 | 2.500000 | 1.750000 | 1.142857 | 1.200000 | 0.250000 | 0.625000 | 0 |
| 6066cc7417b43c749d551e123d00f0c8 | 1002 | 4 | 0 | 4 | 0 | 1 | 0 | 4613 | 59 | 4478 | ... | 0.600000 | 0.428571 | 1.000000 | 1.250000 | 8.000000 | 5.000000 | 3.333333 | 2.400000 | 5.000000 | 0 |
| bc7a87ed5f9c2bca55f9f7a93da0b0c5 | 377 | 22 | 7 | 1 | 0 | 14 | 1 | 809 | 2 | 1102 | ... | 0.500000 | 1.000000 | 2.000000 | 0.000000 | 0.500000 | 1.000000 | 0.500000 | 0.000000 | 0.000000 | 1 |
| e2ca68ac1a6847f4a37f6c9c8ee8695b | 643 | 22 | 7 | 1 | 23 | 63 | 2 | 1747 | 4 | 2677 | ... | 0.500000 | 0.090909 | 1.000000 | 0.000000 | 4.000000 | 1.000000 | 2.500000 | 6.000000 | 10.000000 | 0 |
31698 rows × 299 columns
# Switch from player prefixes to their hero_id column names, then replace
# each hero id with the hero's historical win rate. Series.map is the
# vectorized equivalent of apply(lambda x: d[x]) and yields NaN for a hero
# id missing from the training statistics instead of raising KeyError.
plrs = [f"{i}_hero_id" for i in plrs]
for plr in plrs:
    df_train_features_extended[plr] = df_train_features_extended[plr].map(win_proportion_dict)
    df_test_features_extended[plr] = df_test_features_extended[plr].map(win_proportion_dict)
X and y => distances from Ancients
# Rebuild the plain player prefixes (the previous cell rebound `plrs` to
# hero_id column names), then turn coordinates into distances to the Ancient.
ancient_coord = (x_min, y_min)
plrs = [f"{tm}{plr}" for tm in ("r", "d") for plr in range(1, 6)]
df_train_features_extended = distance_to_ancient(df_train_features_extended, ancient_coord[0], ancient_coord[1])
df_test_features_extended = distance_to_ancient(df_test_features_extended, ancient_coord[0], ancient_coord[1])
Sum up features per team
# Collapse every per-player feature into a per-team sum (hero win rates
# included — their sum acts as a team strength score).
plr_ftrs_lst = [ftr[3:] for ftr in df_train_features_extended.columns if ftr.startswith("r1")]
for feature in plr_ftrs_lst:
    df_train_features_extended = sum_up_ftrs_by_plr(df=df_train_features_extended, ftr=feature)
    df_test_features_extended = sum_up_ftrs_by_plr(df=df_test_features_extended, ftr=feature)
%%time
# summed features
# RF with deeper trees on the team-aggregated extended features.
X = df_train_features_extended.values
y = df_train_targets["radiant_win"].values.astype("int8")
rf_model = RandomForestClassifier(n_estimators=700, max_depth=19, n_jobs=-1, random_state=SEED)
cv_scores_rf = cross_val_score(rf_model, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {cv_scores_rf.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.8208474507971347 CPU times: user 16.1 s, sys: 979 ms, total: 17.1 s Wall time: 1min 20s
sns.heatmap(df_train_features_extended.corr() > 0.6)
;
''
Gold proportion and proportions of the other features (they were normalized per game time, but since taking proportions cancels the time factor the normalization no longer matters; I'll leave it as is anyway)
# Radiant's share of total gold (+1 guards against a zero denominator);
# the raw team gold columns are dropped.
df_train_features_extended["r_gold_prop"] = df_train_features_extended.r_gold / (df_train_features_extended.r_gold + df_train_features_extended.d_gold + 1)
df_test_features_extended["r_gold_prop"] = df_test_features_extended.r_gold / (df_test_features_extended.r_gold + df_test_features_extended.d_gold + 1)
df_train_features_extended = df_train_features_extended.drop(["r_gold", "d_gold"], axis=1)
df_test_features_extended = df_test_features_extended.drop(["r_gold", "d_gold"], axis=1)
df_test_features_extended.columns
Index(['game_time', 'game_mode', 'lobby_type', 'objectives_len', 'chat_len',
'radiant_tower_kills', 'dire_tower_kills', 'diff_tower_kills',
'first_blood_claimed_radiant', 'r_hero_id', 'd_hero_id', 'r_denies',
'd_denies', 'r_lh', 'd_lh', 'r_xp', 'd_xp', 'r_health', 'd_health',
'r_max_health', 'd_max_health', 'r_max_mana', 'd_max_mana', 'r_level',
'd_level', 'r_stuns', 'd_stuns', 'r_creeps_stacked', 'd_creeps_stacked',
'r_camps_stacked', 'd_camps_stacked', 'r_rune_pickups',
'd_rune_pickups', 'r_teamfight_participation',
'd_teamfight_participation', 'r_towers_killed', 'd_towers_killed',
'r_roshans_killed', 'd_roshans_killed', 'r_obs_placed', 'd_obs_placed',
'r_sen_placed', 'd_sen_placed', 'r_ability_upgrades',
'd_ability_upgrades', 'r_ability_uses', 'd_ability_uses', 'r_purchases',
'd_purchases', 'r_item_uses', 'd_item_uses', 'r_damage_given',
'd_damage_given', 'r_damage_taken', 'd_damage_taken',
'r_nearby_creep_death_count', 'd_nearby_creep_death_count',
'r_pred_vict', 'd_pred_vict', 'r_kda', 'd_kda', 'r_dist_to_anc',
'd_dist_to_anc', 'r_gold_prop'],
dtype='object')
# Duration-dependent extended features, normalized per second of game time
# (+1 avoids division by zero when game_time == 0); raw columns are dropped.
per_time_norm_list = ['r_lh', 'd_lh',
                      'r_xp', 'd_xp', 'r_health', 'd_health', 'r_max_health', 'd_max_health',
                      'r_max_mana', 'd_max_mana', 'r_level', 'd_level', 'r_stuns', 'd_stuns',
                      'r_rune_pickups', 'd_rune_pickups', 'r_towers_killed', 'd_towers_killed',
                      'r_obs_placed', 'd_obs_placed', 'r_sen_placed', 'd_sen_placed',
                      'r_ability_upgrades', 'd_ability_upgrades', 'r_ability_uses', 'd_ability_uses',
                      'r_purchases', 'd_purchases', 'r_item_uses', 'd_item_uses',
                      'r_damage_given', 'd_damage_given', 'r_damage_taken', 'd_damage_taken',
                      'r_nearby_creep_death_count', 'd_nearby_creep_death_count']
for ftr in per_time_norm_list:
    df_train_features_extended[f"{ftr}_per_time"] = df_train_features_extended[f"{ftr}"] / (df_train_features_extended["game_time"] + 1)
    df_test_features_extended[f"{ftr}_per_time"] = df_test_features_extended[f"{ftr}"] / (df_test_features_extended["game_time"] + 1)
    df_train_features_extended = df_train_features_extended.drop(f"{ftr}", axis=1)
    df_test_features_extended = df_test_features_extended.drop(f"{ftr}", axis=1)
sns.heatmap(df_train_features_extended.corr() > 0.6)
;
''
count all new features as proportion per team (which kills per time normalization, but okay)
# Paired per-time features reduced to Radiant's share of the team total
# (this cancels the per-time factor, as noted in the text above).
disbalance = ['lh_per_time',
              'xp_per_time',
              'health_per_time',
              'max_health_per_time',
              'max_mana_per_time',
              'level_per_time',
              'stuns_per_time',
              'rune_pickups_per_time',
              'obs_placed_per_time',
              'sen_placed_per_time',
              'ability_upgrades_per_time',
              'ability_uses_per_time',
              'purchases_per_time',
              'item_uses_per_time',
              'damage_given_per_time',
              'damage_taken_per_time',
              'nearby_creep_death_count_per_time']
# r / (r + d + 1): the +1 prevents 0/0 but biases shares slightly toward 0.
for ftr in disbalance:
    df_train_features_extended[f"r_{ftr}_prop"] = df_train_features_extended[f"r_{ftr}"] / (df_train_features_extended[f"r_{ftr}"] + df_train_features_extended[f"d_{ftr}"] + 1)
    df_test_features_extended[f"r_{ftr}_prop"] = df_test_features_extended[f"r_{ftr}"] / (df_test_features_extended[f"r_{ftr}"] + df_test_features_extended[f"d_{ftr}"] + 1)
    df_train_features_extended = df_train_features_extended.drop([f"r_{ftr}", f"d_{ftr}"], axis=1)
    df_test_features_extended = df_test_features_extended.drop([f"r_{ftr}", f"d_{ftr}"], axis=1)
sns.heatmap(df_train_features_extended.corr() > 0.8)
;
''
Pred_vict as difference between radiant and dire
# Net "predicted victory" count (Radiant minus Dire); the two absolute
# team counts are replaced by their difference.
for frame in (df_train_features_extended, df_test_features_extended):
    frame['pred_vict_difference'] = frame["r_pred_vict"] - frame["d_pred_vict"]
df_train_features_extended = df_train_features_extended.drop(['r_pred_vict', 'd_pred_vict'], axis=1)
df_test_features_extended = df_test_features_extended.drop(['r_pred_vict', 'd_pred_vict'], axis=1)
Upsampling + standardization
# Scale every numeric column; one-hot encode the single binary flag.
cat_cols = ['first_blood_claimed_radiant']
num_cols = [col for col in df_train_features_extended.columns if col not in cat_cols]
preprocessor = ColumnTransformer(
    transformers=[("scaler", StandardScaler(), num_cols),
                  ("ohe", OneHotEncoder(drop="first"), cat_cols)]
)
# Test index for the submission; class balance check before resampling.
# NOTE(review): taken from df_test_features rather than the extended frame —
# harmless only if both carry the same match_id_hash index; confirm.
indexes = df_test_features.index
df_train_targets.radiant_win.value_counts()
True 16670 False 15028 Name: radiant_win, dtype: int64
# Join extended features with targets (duplicate game_time dropped) so rows
# and labels stay aligned, then upsample Dire wins with replacement to 50/50.
# NOTE(review): resampling BEFORE cross-validation duplicates rows across
# folds, so CV scores on this balanced set are optimistically biased.
df_train_ext = pd.concat([df_train_features_extended, df_train_targets.drop("game_time", axis=1)], axis=1)
rad_win = df_train_ext[df_train_ext.radiant_win == True]
dir_win = df_train_ext[df_train_ext.radiant_win == False]
reshape_val = rad_win.shape[0]
dir_win_up = resample(dir_win, random_state=SEED, n_samples=reshape_val, replace=True)
df_train_ext_balanced = pd.concat([rad_win, dir_win_up])
df_train_ext_balanced.radiant_win.value_counts()
True 16670 False 16670 Name: radiant_win, dtype: int64
# Drop the label and the post-game targets (known only after the match,
# i.e. outcome leakage); radiant_win becomes the binary label.
X, y = df_train_ext_balanced.drop(['radiant_win', 'duration', 'time_remaining',
                                   'next_roshan_team'], axis=1), df_train_ext_balanced["radiant_win"].values.astype("int8")
preprocessor.fit(X)
df_train_features_tr = preprocessor.transform(X)
df_test_features_tr = preprocessor.transform(df_test_features_extended)
X.shape, len(y)
((33340, 46), 33340)
Now, applying RandomForest to this extended dotaset
%%time
# extended dotaset
# RF on the scaled, upsampled extended matrix.
# NOTE(review): score inflated by resampling before CV (duplicate rows
# shared between folds); trust the held-out Kaggle score instead.
X = df_train_features_tr
y = y
rf_model = RandomForestClassifier(n_estimators=700, max_depth=7, n_jobs=-1, random_state=SEED)
cv_scores_rf = cross_val_score(rf_model, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {cv_scores_rf.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.8408896301884274 CPU times: user 9.9 s, sys: 866 ms, total: 10.8 s Wall time: 44 s
# extended dotaset — BEST SO FAR: on Kaggle it gave a 0.84015 score
# Fit the tuned RF on the full balanced extended set and predict on test.
rf_model = RandomForestClassifier(n_estimators=700, max_depth=7, n_jobs=-1, random_state=SEED)
rf_model.fit(X, y)
X_test = df_test_features_tr
rf_model
# Submission: probability that Radiant wins, indexed by match hash.
y_test_pred = rf_model.predict_proba(X_test)[:, 1]
df_submission = pd.DataFrame({"radiant_win_prob": y_test_pred},
                             index=indexes)
submission_filename = "submission_{}.csv".format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))
Файл посылки сохранен, как: submission_2023-04-09_19-03-11.csv
# Feature importances of the fitted RF.
# Fix: the model was trained on the ColumnTransformer output, whose column
# order is the scaled numeric columns FIRST, then the one-hot flag — not the
# raw df_test_features_extended column order. Zipping with the raw columns
# mislabels the bars; num_cols + cat_cols matches the transformed order
# (the binary flag with drop="first" yields exactly one OHE column).
feature_names = num_cols + cat_cols
ftr_imps = pd.DataFrame(list(zip(feature_names, rf_model.feature_importances_)), columns=["feature", "importance"])
sns.barplot(data=ftr_imps.sort_values("importance", ascending=False),
            x="importance", y="feature",
            linewidth=1.5, edgecolor="teal", facecolor='teal', alpha=0.5)
;
''
Now I made sure that extended DF is better and want to try other models.
import xgboost
import catboost
%%time
# extended dotaset
X = df_train_features_tr
y = y
cat_mdl = catboost.CatBoostClassifier(verbose=False, random_state=SEED,
n_estimators=350, max_depth=11)
cv_scores_cb = cross_val_score(cat_mdl, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {cv_scores_cb.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.912193335535936 CPU times: user 19min 10s, sys: 17.8 s, total: 19min 28s Wall time: 3min 39s
%%time
# extended dotaset
X = df_train_features_tr
y = y
cat_mdl = catboost.CatBoostClassifier(verbose=False, random_state=SEED,
n_estimators=700, max_depth=11)
cv_scores_cb = cross_val_score(cat_mdl, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {cv_scores_cb.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.9150766864082801 CPU times: user 38min 54s, sys: 38.9 s, total: 39min 33s Wall time: 7min 18s
# CatBoost submission on the extended dataset.
# NOTE(review): the original note said this was "worse than RF" despite the
# higher CV score — presumably it scored lower on the Kaggle leaderboard
# (consistent with CV inflation from pre-CV upsampling); confirm.
cat_mdl = catboost.CatBoostClassifier(verbose=False, random_state=SEED,
                                      n_estimators=700, max_depth=11)
cat_mdl.fit(X, y)
X_test = df_test_features_tr
cat_mdl
test_probs = cat_mdl.predict_proba(X_test)[:, 1]
df_submission = pd.DataFrame({"radiant_win_prob": test_probs}, index=indexes)
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
submission_filename = "submission_{}.csv".format(timestamp)
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))
Файл посылки сохранен, как: submission_2023-04-09_20-16-30.csv
%%time
# extended dotaset
X = df_train_features_tr
y = y
xgb_mdl = xgboost.XGBRFClassifier(n_estimators=240, max_depth=7, random_state=SEED)
xgb_scores_cb = cross_val_score(xgb_mdl, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {xgb_scores_cb.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.8478161514415096 CPU times: user 6min 31s, sys: 2.51 s, total: 6min 33s Wall time: 53.7 s
%%time
# extended dotaset
X = df_train_features_tr
y = y
xgb_mdl = xgboost.XGBRFClassifier(n_estimators=700, max_depth=7, random_state=SEED)
xgb_scores_cb = cross_val_score(xgb_mdl, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {xgb_scores_cb.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.8478865222757868 CPU times: user 21min 22s, sys: 11.7 s, total: 21min 34s Wall time: 3min 2s
# XGBRF submission on the extended dataset — slightly better than RF in CV;
# best result so far.
xgb_mdl = xgboost.XGBRFClassifier(n_estimators=700, max_depth=7, random_state=SEED)
xgb_mdl.fit(X, y)
X_test = df_test_features_tr
xgb_mdl
test_probs = xgb_mdl.predict_proba(X_test)[:, 1]
df_submission = pd.DataFrame({"radiant_win_prob": test_probs}, index=indexes)
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
submission_filename = "submission_{}.csv".format(timestamp)
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))
Файл посылки сохранен, как: submission_2023-04-09_20-20-42.csv
Okay, I'll try to find the best hyperparameters for XGBRFClassifier.
# NOTE(review): leftover template cell — `random_grid` is first defined in the
# NEXT cell, and `X_train`/`y_train` are never defined anywhere in this
# notebook, so running this cell as-is raises NameError. There is no output
# below it, which suggests it was never executed. The working search is the
# XGBRF version that follows.
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)
rf_random.best_params_
# Coarse randomized search over XGBRF hyperparameters.
# NOTE(review): max_depth=-1 is a LightGBM convention for "no limit"; the
# suspiciously fast 0.1s fits in the log suggest xgboost does not grow real
# trees for it — confirm before trusting those candidates.
random_grid = {
    'max_depth': [1, 5, 50, -1],
    'n_estimators': [1, 10, 100],
    'learning_rate': np.linspace(0.01, 0.5, 4),
    'reg_alpha': [0, 0.2, 1],
    'reg_lambda': [0, 0.2, 1],
}
xgb_mdl = xgboost.XGBRFClassifier()
# FIX: select candidates by the competition metric (ROC-AUC); without
# `scoring` RandomizedSearchCV falls back to the estimator's default
# accuracy score, which is not what the rest of the notebook optimizes.
xgb_rand_search = RandomizedSearchCV(estimator=xgb_mdl,
                                     param_distributions=random_grid,
                                     scoring="roc_auc",
                                     cv=3, verbose=2, random_state=42, n_jobs=-1)
xgb_rand_search.fit(X, y)
xgb_rand_search.best_params_
Fitting 3 folds for each of 10 candidates, totalling 30 fits [CV] END learning_rate=0.5, max_depth=-1, n_estimators=100, reg_alpha=0, reg_lambda=0.2; total time= 0.1s [CV] END learning_rate=0.5, max_depth=-1, n_estimators=100, reg_alpha=0, reg_lambda=0.2; total time= 0.1s [CV] END learning_rate=0.5, max_depth=-1, n_estimators=100, reg_alpha=0, reg_lambda=0.2; total time= 0.1s [CV] END learning_rate=0.01, max_depth=5, n_estimators=1, reg_alpha=0.2, reg_lambda=0; total time= 0.3s [CV] END learning_rate=0.01, max_depth=5, n_estimators=1, reg_alpha=0.2, reg_lambda=0; total time= 0.3s [CV] END learning_rate=0.01, max_depth=5, n_estimators=1, reg_alpha=0.2, reg_lambda=0; total time= 0.3s [CV] END learning_rate=0.5, max_depth=50, n_estimators=10, reg_alpha=0.2, reg_lambda=1; total time= 5.2s [CV] END learning_rate=0.5, max_depth=50, n_estimators=10, reg_alpha=0.2, reg_lambda=1; total time= 5.7s [CV] END learning_rate=0.33666666666666667, max_depth=50, n_estimators=1, reg_alpha=0.2, reg_lambda=1; total time= 0.7s [CV] END learning_rate=0.33666666666666667, max_depth=50, n_estimators=1, reg_alpha=0.2, reg_lambda=1; total time= 0.7s [CV] END learning_rate=0.33666666666666667, max_depth=50, n_estimators=1, reg_alpha=0.2, reg_lambda=1; total time= 0.7s [CV] END learning_rate=0.5, max_depth=-1, n_estimators=10, reg_alpha=0, reg_lambda=0; total time= 0.0s [CV] END learning_rate=0.5, max_depth=-1, n_estimators=10, reg_alpha=0, reg_lambda=0; total time= 0.0s [CV] END learning_rate=0.5, max_depth=-1, n_estimators=10, reg_alpha=0, reg_lambda=0; total time= 0.0s [CV] END learning_rate=0.5, max_depth=50, n_estimators=10, reg_alpha=0.2, reg_lambda=1; total time= 5.4s [CV] END learning_rate=0.17333333333333334, max_depth=5, n_estimators=100, reg_alpha=0, reg_lambda=0.2; total time= 14.1s [CV] END learning_rate=0.17333333333333334, max_depth=5, n_estimators=100, reg_alpha=0, reg_lambda=0.2; total time= 14.9s [CV] END learning_rate=0.17333333333333334, max_depth=-1, n_estimators=1, 
reg_alpha=1, reg_lambda=0; total time= 0.0s [CV] END learning_rate=0.17333333333333334, max_depth=-1, n_estimators=1, reg_alpha=1, reg_lambda=0; total time= 0.0s [CV] END learning_rate=0.17333333333333334, max_depth=-1, n_estimators=1, reg_alpha=1, reg_lambda=0; total time= 0.0s [CV] END learning_rate=0.17333333333333334, max_depth=5, n_estimators=10, reg_alpha=0.2, reg_lambda=0.2; total time= 1.9s [CV] END learning_rate=0.17333333333333334, max_depth=5, n_estimators=10, reg_alpha=0.2, reg_lambda=0.2; total time= 1.7s [CV] END learning_rate=0.17333333333333334, max_depth=5, n_estimators=10, reg_alpha=0.2, reg_lambda=0.2; total time= 1.8s [CV] END learning_rate=0.17333333333333334, max_depth=5, n_estimators=100, reg_alpha=0, reg_lambda=0.2; total time= 13.6s [CV] END learning_rate=0.17333333333333334, max_depth=50, n_estimators=100, reg_alpha=0, reg_lambda=0; total time= 57.3s [CV] END learning_rate=0.01, max_depth=50, n_estimators=100, reg_alpha=0.2, reg_lambda=0; total time= 57.9s [CV] END learning_rate=0.01, max_depth=50, n_estimators=100, reg_alpha=0.2, reg_lambda=0; total time= 59.3s [CV] END learning_rate=0.17333333333333334, max_depth=50, n_estimators=100, reg_alpha=0, reg_lambda=0; total time= 59.7s [CV] END learning_rate=0.01, max_depth=50, n_estimators=100, reg_alpha=0.2, reg_lambda=0; total time= 59.2s [CV] END learning_rate=0.17333333333333334, max_depth=50, n_estimators=100, reg_alpha=0, reg_lambda=0; total time= 59.4s
{'reg_lambda': 0,
'reg_alpha': 0,
'n_estimators': 100,
'max_depth': 50,
'learning_rate': 0.17333333333333334}
%%time
best_xgb = xgb_rand_search.best_estimator_
best_xgb_scores_cb = cross_val_score(best_xgb, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {best_xgb_scores_cb.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.9107475944056039 CPU times: user 7min 13s, sys: 3.84 s, total: 7min 17s Wall time: 58.9 s
# Submission from the first randomized-search winner (extended dataset);
# params only marginally better than the hand-tuned RF, if at all.
xgb_mdl = best_xgb
xgb_mdl.fit(X, y)
X_test = df_test_features_tr
xgb_mdl
test_probs = xgb_mdl.predict_proba(X_test)[:, 1]
df_submission = pd.DataFrame({"radiant_win_prob": test_probs}, index=indexes)
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
submission_filename = "submission_{}.csv".format(timestamp)
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))
Файл посылки сохранен, как: submission_2023-04-09_20-44-23.csv
# Finer randomized search: integer ranges for depth and tree count.
random_grid = {
    'max_depth': np.arange(1, 30, 5),
    'n_estimators': np.arange(50, 500, 10),
    'learning_rate': np.linspace(0.01, 0.5, 4),
    'reg_alpha': [0, 0.2, 1],
    'reg_lambda': [0, 0.2, 1],
}
xgb_mdl = xgboost.XGBRFClassifier()
# FIX: score candidates with ROC-AUC (the competition metric) instead of the
# default accuracy scorer RandomizedSearchCV uses when `scoring` is omitted.
xgb_rand_search = RandomizedSearchCV(estimator=xgb_mdl,
                                     param_distributions=random_grid,
                                     scoring="roc_auc",
                                     cv=3, verbose=2, random_state=42, n_jobs=-1)
xgb_rand_search.fit(X, y)
xgb_rand_search.best_params_
Fitting 3 folds for each of 10 candidates, totalling 30 fits [CV] END learning_rate=0.33666666666666667, max_depth=6, n_estimators=180, reg_alpha=1, reg_lambda=1; total time= 30.5s [CV] END learning_rate=0.33666666666666667, max_depth=6, n_estimators=180, reg_alpha=1, reg_lambda=1; total time= 30.5s [CV] END learning_rate=0.01, max_depth=11, n_estimators=100, reg_alpha=0.2, reg_lambda=1; total time= 34.8s [CV] END learning_rate=0.01, max_depth=11, n_estimators=100, reg_alpha=0.2, reg_lambda=1; total time= 35.8s [CV] END learning_rate=0.01, max_depth=11, n_estimators=100, reg_alpha=0.2, reg_lambda=1; total time= 38.2s [CV] END learning_rate=0.33666666666666667, max_depth=1, n_estimators=410, reg_alpha=1, reg_lambda=0.2; total time= 13.8s [CV] END learning_rate=0.33666666666666667, max_depth=1, n_estimators=410, reg_alpha=1, reg_lambda=0.2; total time= 11.9s [CV] END learning_rate=0.33666666666666667, max_depth=1, n_estimators=410, reg_alpha=1, reg_lambda=0.2; total time= 13.1s [CV] END learning_rate=0.33666666666666667, max_depth=6, n_estimators=180, reg_alpha=1, reg_lambda=1; total time= 35.6s [CV] END learning_rate=0.33666666666666667, max_depth=11, n_estimators=120, reg_alpha=0, reg_lambda=0.2; total time= 48.7s [CV] END learning_rate=0.33666666666666667, max_depth=11, n_estimators=120, reg_alpha=0, reg_lambda=0.2; total time= 47.9s [CV] END learning_rate=0.33666666666666667, max_depth=11, n_estimators=120, reg_alpha=0, reg_lambda=0.2; total time= 48.3s [CV] END learning_rate=0.01, max_depth=6, n_estimators=110, reg_alpha=1, reg_lambda=0.2; total time= 20.2s [CV] END learning_rate=0.01, max_depth=6, n_estimators=110, reg_alpha=1, reg_lambda=0.2; total time= 21.1s [CV] END learning_rate=0.01, max_depth=6, n_estimators=110, reg_alpha=1, reg_lambda=0.2; total time= 21.6s [CV] END learning_rate=0.33666666666666667, max_depth=16, n_estimators=260, reg_alpha=0, reg_lambda=0.2; total time= 2.5min [CV] END learning_rate=0.33666666666666667, max_depth=16, 
n_estimators=260, reg_alpha=0, reg_lambda=0.2; total time= 2.6min [CV] END learning_rate=0.33666666666666667, max_depth=16, n_estimators=260, reg_alpha=0, reg_lambda=0.2; total time= 2.8min [CV] END learning_rate=0.33666666666666667, max_depth=26, n_estimators=470, reg_alpha=1, reg_lambda=0.2; total time= 5.0min [CV] END learning_rate=0.33666666666666667, max_depth=26, n_estimators=470, reg_alpha=1, reg_lambda=0.2; total time= 5.1min [CV] END learning_rate=0.33666666666666667, max_depth=6, n_estimators=390, reg_alpha=1, reg_lambda=0.2; total time= 1.4min [CV] END learning_rate=0.33666666666666667, max_depth=26, n_estimators=470, reg_alpha=1, reg_lambda=0.2; total time= 5.2min [CV] END learning_rate=0.33666666666666667, max_depth=6, n_estimators=390, reg_alpha=1, reg_lambda=0.2; total time= 1.5min [CV] END learning_rate=0.33666666666666667, max_depth=6, n_estimators=390, reg_alpha=1, reg_lambda=0.2; total time= 1.3min [CV] END learning_rate=0.17333333333333334, max_depth=21, n_estimators=460, reg_alpha=1, reg_lambda=0.2; total time= 4.9min [CV] END learning_rate=0.5, max_depth=11, n_estimators=290, reg_alpha=1, reg_lambda=0; total time= 1.8min [CV] END learning_rate=0.5, max_depth=11, n_estimators=290, reg_alpha=1, reg_lambda=0; total time= 1.9min [CV] END learning_rate=0.5, max_depth=11, n_estimators=290, reg_alpha=1, reg_lambda=0; total time= 1.8min [CV] END learning_rate=0.17333333333333334, max_depth=21, n_estimators=460, reg_alpha=1, reg_lambda=0.2; total time= 4.8min [CV] END learning_rate=0.17333333333333334, max_depth=21, n_estimators=460, reg_alpha=1, reg_lambda=0.2; total time= 4.3min
{'reg_lambda': 0.2,
'reg_alpha': 0,
'n_estimators': 260,
'max_depth': 16,
'learning_rate': 0.33666666666666667}
%%time
best_xgb = xgb_rand_search.best_estimator_
best_xgb_scores_cb = cross_val_score(best_xgb, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {best_xgb_scores_cb.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.9055695068283935 CPU times: user 19min 1s, sys: 8.88 s, total: 19min 9s Wall time: 2min 38s
# Submission from the second randomized-search winner (extended dataset).
xgb_mdl = best_xgb
xgb_mdl.fit(X, y)
X_test = df_test_features_tr
xgb_mdl
test_probs = xgb_mdl.predict_proba(X_test)[:, 1]
df_submission = pd.DataFrame({"radiant_win_prob": test_probs}, index=indexes)
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
submission_filename = "submission_{}.csv".format(timestamp)
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))
Файл посылки сохранен, как: submission_2023-04-09_21-01-37.csv
xgb_rand_search.best_estimator_
XGBRFClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bytree=None,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.33666666666666667,
max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=16, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=260, n_jobs=None, num_parallel_tree=None,
objective='binary:logistic', predictor=None, random_state=None, ...)
from hyperopt import fmin, hp, STATUS_OK, Trials, tpe
space = {'max_depth': hp.quniform('max_depth', 1, 18, 1),
'gamma': hp.uniform ('gamma', 1, 9),
'reg_alpha': hp.quniform('reg_alpha', 10, 100, 1),
'reg_lambda': hp.uniform('reg_lambda', 0, 1),
'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
'n_estimators': hp.uniform('n_estimators', 10, 100),
'seed': SEED,
}
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=SEED)
def objective(space):
    """Hyperopt objective: fit an XGBClassifier with the sampled params and
    return the negative hold-out ROC-AUC as the loss to minimize."""
    mdl = xgboost.XGBClassifier(n_estimators=int(space['n_estimators']),
                                max_depth=int(space['max_depth']),
                                reg_alpha=int(space['reg_alpha']),
                                min_child_weight=int(space['min_child_weight']),
                                colsample_bytree=space['colsample_bytree'],
                                gamma=space['gamma'],
                                reg_lambda=space['reg_lambda'],
                                # FIX: 'seed' was declared in the space but
                                # never used, so trials were not reproducible.
                                random_state=space['seed'],
                                )
    evaluation = [(X_tr, y_tr), (X_te, y_te)]
    mdl.fit(X_tr, y_tr,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10, verbose=False)
    # FIX: ROC-AUC must be computed from scores, not hard class labels.
    # predict() returns 0/1, so the old `pred > 0.5` just echoed the labels
    # and systematically understated (and distorted) the AUC being optimized.
    proba = mdl.predict_proba(X_te)[:, 1]
    roc_auc = roc_auc_score(y_te, proba)
    print ("SCORE:", roc_auc)
    return {'loss': -roc_auc, 'status': STATUS_OK }
# Run TPE optimization for 300 evaluations over the defined space,
# recording every trial.
trials = Trials()
best_hyperparams = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=300,
    trials=trials,
)
SCORE: 0.7513634340883917 SCORE: 0.755151019723742 SCORE: 0.7483487634166751 SCORE: 0.7518972046419697 SCORE: 0.7559947912239934 SCORE: 0.7703491565970344 SCORE: 0.7580307721790461 SCORE: 0.7601649618486065 SCORE: 0.7578446046187981 SCORE: 0.7582323145424448 SCORE: 0.7648193811889341 SCORE: 0.7536932062257472 SCORE: 0.7518920797075863 SCORE: 0.7635502977183792 SCORE: 0.7594783358082735 SCORE: 0.7568155005195186 SCORE: 0.7574782293939057 SCORE: 0.7440342596681022 SCORE: 0.752429247689576 SCORE: 0.7588489190073796 SCORE: 0.7658441953150499 SCORE: 0.7654470704838559 SCORE: 0.7632718813392272 SCORE: 0.7598190863612483 SCORE: 0.7511465169218423 SCORE: 0.7706873446828172 SCORE: 0.7887524216754551 SCORE: 0.7893767135419654 SCORE: 0.7770352669192511 SCORE: 0.7994114502320501 SCORE: 0.7783120377913814 SCORE: 0.7755458688537353 SCORE: 0.8009520976414015 SCORE: 0.8012928481943763 SCORE: 0.756964958577862 SCORE: 0.7766381420880571 SCORE: 0.7701971360714994 SCORE: 0.7541219156244623 SCORE: 0.7697197397961404 SCORE: 0.763018254670773 SCORE: 0.7683985720205301 SCORE: 0.7775562251368703 SCORE: 0.7681278430429535 SCORE: 0.795982552420017 SCORE: 0.7621181088923025 SCORE: 0.7718718667360432 SCORE: 0.759576544522781 SCORE: 0.7511516418562258 SCORE: 0.7615220041649018 SCORE: 0.76495942433973 SCORE: 0.75321837241758 SCORE: 0.7749062021840742 SCORE: 0.7546736234483828 SCORE: 0.757832627244059 SCORE: 0.7779379751649135 SCORE: 0.767748655482102 SCORE: 0.7809108114003417 SCORE: 0.7570606048251776 SCORE: 0.756484999835311 SCORE: 0.7709452613244351 SCORE: 0.7670543420401937 SCORE: 0.7563611664488857 SCORE: 0.7524753720990279 SCORE: 0.7532465595566897 SCORE: 0.7679160508107876 SCORE: 0.7969527197738857 SCORE: 0.7956759489017553 SCORE: 0.7868300242380606 SCORE: 0.7885073173697962 SCORE: 0.7935648214369211 SCORE: 0.767807592227513 SCORE: 0.8063682895318436 SCORE: 0.8172339853862208 SCORE: 0.7773316205687455 SCORE: 0.8057721848044428 SCORE: 0.8134054002758021 SCORE: 0.7834839603373842 SCORE: 
0.7941660510987054 SCORE: 0.7695702817377972 SCORE: 0.7778892882882696 SCORE: 0.752836622389537 SCORE: 0.8117793564879021 SCORE: 0.7530689143592368 SCORE: 0.7845207518382391 SCORE: 0.753419079819759 SCORE: 0.8036046830613893 SCORE: 0.7650337358882916 SCORE: 0.8007975146486748 SCORE: 0.7538093522105973 SCORE: 0.7577651681358528 SCORE: 0.7787561219932471 SCORE: 0.7824360840065423 SCORE: 0.76310108858213 SCORE: 0.8045697254808744 SCORE: 0.7512601004395005 SCORE: 0.7512806001770348 SCORE: 0.7589522526562708 SCORE: 0.7891264843019228 SCORE: 0.7597388149170831 SCORE: 0.7556617280725939 SCORE: 0.7979203246605624 SCORE: 0.7599711068867832 SCORE: 0.7699674065689912 SCORE: 0.7554584582032232 SCORE: 0.7574372299188371 SCORE: 0.7612615250560921 SCORE: 0.7531978726800458 SCORE: 0.7757089742092572 SCORE: 0.7939747586040738 SCORE: 0.7636946308423389 SCORE: 0.7980108459734945 SCORE: 0.7580000225727447 SCORE: 0.7735927218100394 SCORE: 0.7672686967395512 SCORE: 0.7734039917825996 SCORE: 0.7593750021593824 SCORE: 0.794493989315721 SCORE: 0.7593186278811632 SCORE: 0.7559794164208424 SCORE: 0.7868556489099784 SCORE: 0.8095401056634786 SCORE: 0.8038241626951304 SCORE: 0.7805495611098329 SCORE: 0.7803172691401328 SCORE: 0.7539126858594883 SCORE: 0.7693021152274123 SCORE: 0.7843038346716898 SCORE: 0.7915519026865943 SCORE: 0.7686103642526959 SCORE: 0.7879240249783543 SCORE: 0.7518510802325177 SCORE: 0.8187336333205036 SCORE: 0.767410467396319 SCORE: 0.7559922287568015 SCORE: 0.7872066493317201 SCORE: 0.7620250251121785 SCORE: 0.7612538376545168 SCORE: 0.7674394894966485 SCORE: 0.753953685334557 SCORE: 0.7770275795176756 SCORE: 0.7523566636469862 SCORE: 0.7541629150995307 SCORE: 0.7711954905644776 SCORE: 0.7839015849061124 SCORE: 0.7577130838307975 SCORE: 0.8080917070730314 SCORE: 0.81173323207845 SCORE: 0.7908242771711927 SCORE: 0.8203178426721155 SCORE: 0.8049779351420551 SCORE: 0.8034757247405804 SCORE: 0.8092762291262575 SCORE: 0.8058960181908683 SCORE: 0.8158461934636241 SCORE: 
0.8042417872638585 SCORE: 0.8080327703276204 SCORE: 0.8062880180876786 SCORE: 0.7588489190073796 SCORE: 0.7828537085752706 SCORE: 0.8251456172367321 SCORE: 0.7820201869437863 SCORE: 0.8070626029737518 SCORE: 0.7774144544801026 SCORE: 0.7781318301267368 SCORE: 0.7968288863874602 SCORE: 0.7976752203549035 SCORE: 0.765134507069991 SCORE: 0.7808800617940402 SCORE: 0.7871758997254188 SCORE: 0.7567616887084911 SCORE: 0.7698384482481823 SCORE: 0.8061411224965271 SCORE: 0.7788543307077547 SCORE: 0.7540493315818726 SCORE: 0.7946613846444066 SCORE: 0.7793189146471546 SCORE: 0.7552210412991398 SCORE: 0.761542503902436 SCORE: 0.7883988587865214 SCORE: 0.7750300355704995 SCORE: 0.7861519205605226 SCORE: 0.7823711873655281 SCORE: 0.812208065886617 SCORE: 0.7961969071193746 SCORE: 0.8094675216208888 SCORE: 0.7768311620886607 SCORE: 0.7548828532133569 SCORE: 0.7995796805219555 SCORE: 0.7656349655500759 SCORE: 0.7654778200901574 SCORE: 0.7877976291247373 SCORE: 0.7671576756890848 SCORE: 0.779569143887197 SCORE: 0.7527153514703034 SCORE: 0.8108963130185541 SCORE: 0.788532942041714 SCORE: 0.7692346561192062 SCORE: 0.7724782213322111 SCORE: 0.7650106736835656 SCORE: 0.7847479188735555 SCORE: 0.7613751085737505 SCORE: 0.791740632714034 SCORE: 0.8059455400287319 SCORE: 0.762252192147495 SCORE: 0.7782505385787785 SCORE: 0.7616304627481764 SCORE: 0.7938304254801142 SCORE: 0.7590965857802303 SCORE: 0.7885406294432893 SCORE: 0.7578890015222783 SCORE: 0.8159956515219673 SCORE: 0.770935011455668 SCORE: 0.8038292876295139 SCORE: 0.7711801157613268 SCORE: 0.7596978154420146 SCORE: 0.7541577901651473 SCORE: 0.7665436336913417 SCORE: 0.7553499996199486 SCORE: 0.804933538238575 SCORE: 0.7750479728408421 SCORE: 0.7935391967650032 SCORE: 0.7623888378698792 SCORE: 0.7807587908748067 SCORE: 0.7586832511846661 SCORE: 0.8116913976421616 SCORE: 0.7553986864965925 SCORE: 0.7708478875711473 SCORE: 0.8005677851461667 SCORE: 0.7552492284382494 SCORE: 0.7711749908269433 SCORE: 0.7812848740268095 SCORE: 
0.7708991369149829 SCORE: 0.8081899157875388 SCORE: 0.7528417473239205 SCORE: 0.7683857596845712 SCORE: 0.810664856010074 SCORE: 0.7579795228352104 SCORE: 0.795286511472137 SCORE: 0.7958348218676459 SCORE: 0.7618806919882188 SCORE: 0.8125744411115096 SCORE: 0.7551228325846323 SCORE: 0.7687444475078883 SCORE: 0.7567950007819844 SCORE: 0.7582656266159378 SCORE: 0.757914626194196 SCORE: 0.7695907814753316 SCORE: 0.761971213301151 SCORE: 0.7540595814506398 SCORE: 0.7669100089162342 SCORE: 0.7769165584672093 SCORE: 0.8020486608488873 SCORE: 0.7570554798907939 SCORE: 0.8086673120628979 SCORE: 0.795017510000532 SCORE: 0.7899369437286812 SCORE: 0.7611479415384339 SCORE: 0.8154021092617584 SCORE: 0.815219339129922 SCORE: 0.8018274537091741 SCORE: 0.8082906869692381 SCORE: 0.8117793564879021 SCORE: 0.813861461852407 SCORE: 0.8000673266660816 SCORE: 0.8107912518636909 SCORE: 0.7946921342507081 SCORE: 0.8011818271439098 SCORE: 0.7993730132241734 SCORE: 0.7921505698811868 SCORE: 0.7944675296825835 SCORE: 0.790124838794901 SCORE: 0.802877057545988 SCORE: 0.8090183124846396 SCORE: 0.7987999707014987 SCORE: 0.802340724525218 SCORE: 0.7911086534459483 SCORE: 0.7886285882890297 SCORE: 0.8065074977214195 SCORE: 0.810210521939441 SCORE: 0.7834959377121232 SCORE: 0.8050061222811647 SCORE: 0.8076433328980018 SCORE: 0.7871084406172126 SCORE: 0.8005549728102076 SCORE: 0.8023586617955605 SCORE: 0.7883399220411105 SCORE: 0.8143960673672048 SCORE: 0.7582861263534721 SCORE: 0.7806554572259157 SCORE: 0.7865874823995933 SCORE: 0.7835864590250556 SCORE: 0.8070113536299162 SCORE: 0.7557394370495671 SCORE: 0.8117844814222857 SCORE: 0.8035559961847455 SCORE: 0.7787663718620141 SCORE: 0.7808228525546013 SCORE: 0.7774298292832532 SCORE: 0.7969552822410775 SCORE: 0.7887788813085926 100%|██████████| 300/300 [28:59<00:00, 5.80s/trial, best loss: -0.8251456172367321]
print(best_hyperparams)
{'colsample_bytree': 0.7612928819881977, 'gamma': 1.1644796265046675, 'max_depth': 14.0, 'min_child_weight': 1.0, 'n_estimators': 84.47773701271012, 'reg_alpha': 10.0, 'reg_lambda': 0.605687781863435}
%%time
# extended dotaset xgbrf params from hyperopt
X = df_train_features_tr
y = y
xgb_mdl = xgboost.XGBRFClassifier(n_estimators=100, max_depth=14, colsample_bytree=0.76, gamma=1.16, min_child_weight=1, reg_alpha=10, reg_lambda=0.6, random_state=SEED)
xgb_scores_cb = cross_val_score(xgb_mdl, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {xgb_scores_cb.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.8488666886961663 CPU times: user 3min 52s, sys: 2.04 s, total: 3min 54s Wall time: 32.3 s
%%time
# extended dotaset xgbrf params from hyperopt
X = df_train_features_tr
y = y
xgb_mdl = xgboost.XGBRFClassifier(n_estimators=200, max_depth=14, colsample_bytree=0.76, gamma=1.16, min_child_weight=1, reg_alpha=10, reg_lambda=0.6, random_state=SEED)
xgb_scores_cb = cross_val_score(xgb_mdl, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {xgb_scores_cb.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.8490291232311364 CPU times: user 8min 31s, sys: 4.3 s, total: 8min 35s Wall time: 1min 10s
%%time
# extended dotaset xgbrf params from hyperopt
X = df_train_features_tr
y = y
xgb_mdl = xgboost.XGBRFClassifier(n_estimators=700, max_depth=7, colsample_bytree=0.76, gamma=1.16, min_child_weight=1, reg_alpha=10, reg_lambda=0.6, random_state=SEED)
xgb_scores_cb = cross_val_score(xgb_mdl, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {xgb_scores_cb.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.8399883282848781 CPU times: user 14min 13s, sys: 5.05 s, total: 14min 18s Wall time: 1min 54s
# Submission with the hyperopt-tuned XGBRF (extended dataset) — slightly
# better than plain RF; best hyperparameters so far.
xgb_mdl = xgboost.XGBRFClassifier(n_estimators=700, max_depth=7, colsample_bytree=0.76, gamma=1.16, min_child_weight=1, reg_alpha=10, reg_lambda=0.6, random_state=SEED)
xgb_mdl.fit(X, y)
X_test = df_test_features_tr
xgb_mdl
test_probs = xgb_mdl.predict_proba(X_test)[:, 1]
df_submission = pd.DataFrame({"radiant_win_prob": test_probs}, index=indexes)
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
submission_filename = "submission_{}.csv".format(timestamp)
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))
Файл посылки сохранен, как: submission_2023-04-09_21-58-52.csv
I forgot about learning rate!
# Second hyperopt space — identical to the first, plus the learning rate
# (eta) that was missing before.
space = dict(
    max_depth=hp.quniform('max_depth', 1, 18, 1),
    gamma=hp.uniform('gamma', 1, 9),
    eta=hp.uniform('eta', 0, 1),
    reg_alpha=hp.quniform('reg_alpha', 10, 100, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=hp.quniform('n_estimators', 200, 700, 50),
    seed=SEED,
)
def objective(space):
    """Hyperopt objective: train an XGBClassifier on the sampled params.

    Fits on (X_tr, y_tr) with early stopping against (X_te, y_te) and
    returns the negated validation ROC-AUC as the loss (hyperopt minimizes).

    Parameters
    ----------
    space : dict
        One sample from the hyperopt search space. quniform-sampled values
        arrive as floats, hence the int() casts.

    Returns
    -------
    dict
        {'loss': -roc_auc, 'status': STATUS_OK} as required by fmin.
    """
    mdl = xgboost.XGBClassifier(n_estimators=int(space['n_estimators']),
                                max_depth=int(space['max_depth']),
                                reg_alpha=int(space['reg_alpha']),
                                min_child_weight=int(space['min_child_weight']),
                                colsample_bytree=space['colsample_bytree'],
                                gamma=space['gamma'],
                                reg_lambda=space['reg_lambda'],
                                eta=space['eta'],
                                # was missing: without this every trial used a
                                # different RNG state, making runs irreproducible
                                random_state=int(space['seed'])
                                )
    evaluation = [(X_tr, y_tr), (X_te, y_te)]
    mdl.fit(X_tr, y_tr,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10, verbose=False)
    # BUG FIX: the original passed hard labels (mdl.predict(...) > 0.5) to
    # roc_auc_score, which collapses AUC to a single-threshold metric and
    # mis-ranks trials. ROC-AUC needs the positive-class probability.
    pred = mdl.predict_proba(X_te)[:, 1]
    roc_auc = roc_auc_score(y_te, pred)
    print("SCORE:", roc_auc)
    return {'loss': -roc_auc, 'status': STATUS_OK}
# Run TPE optimization for 300 trials, minimizing the negated ROC-AUC.
trials = Trials()
best_hyperparams = fmin(fn=objective,
space=space,
algo=tpe.suggest,
max_evals=300,
trials=trials)
SCORE: 0.7545164779884643 SCORE: 0.7564235006227082 SCORE: 0.7505171001209483 SCORE: 0.7463469294963352 SCORE: 0.7543098106906821 SCORE: 0.7537837275386794 SCORE: 0.7489704928159938 SCORE: 0.7697325521320993 SCORE: 0.7891597963754161 SCORE: 0.748070347037523 SCORE: 0.7669604232988502 SCORE: 0.7549033529508912 SCORE: 0.7992201577374186 SCORE: 0.7542893109531479 SCORE: 0.7533242685336631 SCORE: 0.7621437335642203 SCORE: 0.7684088218892973 SCORE: 0.751045745740143 SCORE: 0.7667520284950958 SCORE: 0.7594450237347804 SCORE: 0.7888497954287428 SCORE: 0.7906201723406024 SCORE: 0.7854439598214358 SCORE: 0.7830800406493672 SCORE: 0.7629567554581703 SCORE: 0.7769549954750861 SCORE: 0.7889736288151681 SCORE: 0.7545190404556562 SCORE: 0.7891264843019228 SCORE: 0.7948928416528869 SCORE: 0.8033638687288942 SCORE: 0.7792147460370437 SCORE: 0.8002321595275755 SCORE: 0.7544720810849843 SCORE: 0.7547018105874924 SCORE: 0.7798783098726505 SCORE: 0.7753579737875155 SCORE: 0.754696685653109 SCORE: 0.7588591688761468 SCORE: 0.7568488125930117 SCORE: 0.7576208350118934 SCORE: 0.7790652879787006 SCORE: 0.74943078678223 SCORE: 0.7538196020793644 SCORE: 0.760925899437501 SCORE: 0.7500824308266301 SCORE: 0.7497425152348751 SCORE: 0.7994396373711598 SCORE: 0.7665820706992184 SCORE: 0.7559358544785821 SCORE: 0.7601436271498525 SCORE: 0.7717608456855769 SCORE: 0.80250299491952 SCORE: 0.7678801762701026 SCORE: 0.8075246244459601 SCORE: 0.8080968320074149 SCORE: 0.7571511261381098 SCORE: 0.7539024359907212 SCORE: 0.8044851640635455 SCORE: 0.7538503516856657 SCORE: 0.7774887660286642 SCORE: 0.7747311194538133 SCORE: 0.7453246778374112 SCORE: 0.756916271701218 SCORE: 0.7607405668384728 SCORE: 0.8014610784842816 SCORE: 0.8104948982141965 SCORE: 0.8011510775376083 SCORE: 0.767294321411469 SCORE: 0.7987666586280056 SCORE: 0.7895731309709804 SCORE: 0.8016284738129672 SCORE: 0.7823788747671034 SCORE: 0.7749839111610475 SCORE: 0.7817187083599082 SCORE: 0.7907550905570147 SCORE: 0.7861467956261392 SCORE: 
0.8080404577291957 SCORE: 0.760066753134099 SCORE: 0.8220933445183587 SCORE: 0.8188087798302852 SCORE: 0.7691936566441376 SCORE: 0.7534199147809787 SCORE: 0.80817710345158 SCORE: 0.764853528223647 SCORE: 0.7749762237594722 SCORE: 0.7761761206158491 SCORE: 0.754044206647489 SCORE: 0.7735850344084639 SCORE: 0.780663144627491 SCORE: 0.7580879814184851 SCORE: 0.7529886429150718 SCORE: 0.7550946454455226 SCORE: 0.7791899563263457 SCORE: 0.7760181401947107 SCORE: 0.8031844960254695 SCORE: 0.8049284133041914 SCORE: 0.7711801157613268 SCORE: 0.7592332315026147 SCORE: 0.7676350719644438 SCORE: 0.7541193531572705 SCORE: 0.7635528601855709 SCORE: 0.7593698772249988 SCORE: 0.7934743001239888 SCORE: 0.7802634573291054 SCORE: 0.7770506417224019 SCORE: 0.7536675815538295 SCORE: 0.7558683953703762 SCORE: 0.7575175013630022 SCORE: 0.7791224972181395 SCORE: 0.7693866766447411 SCORE: 0.7587865848335571 SCORE: 0.7643291725776162 SCORE: 0.752823810053578 SCORE: 0.7599403572804818 SCORE: 0.7952566544105876 SCORE: 0.7946818843819409 SCORE: 0.7621667957689462 SCORE: 0.7810730817946437 SCORE: 0.7561553341123233 SCORE: 0.7573757307062344 SCORE: 0.7558632704359926 SCORE: 0.7712988242133686 SCORE: 0.753325996039635 SCORE: 0.753533498298637 SCORE: 0.7709093867837501 SCORE: 0.8184910914820365 SCORE: 0.7992696795752823 SCORE: 0.7654103609819513 SCORE: 0.7601521495126475 SCORE: 0.8084922293326368 SCORE: 0.7582579392143625 SCORE: 0.7824591462112684 SCORE: 0.7960440516326198 SCORE: 0.7531637256453328 SCORE: 0.7524830595006033 SCORE: 0.7552517909054411 SCORE: 0.7751205568834317 SCORE: 0.7962968433398541 SCORE: 0.7847940432830077 SCORE: 0.8164781727317098 SCORE: 0.7843815436486631 SCORE: 0.7581187310247864 SCORE: 0.7728138469508022 SCORE: 0.7834421259010959 SCORE: 0.8206688430938572 SCORE: 0.806701352683243 SCORE: 0.8196158418286319 SCORE: 0.8259432643275314 SCORE: 0.8142286720385192 SCORE: 0.8103471676618254 SCORE: 0.8069182698497921 SCORE: 0.8090806466584621 SCORE: 0.8074622902721373 SCORE: 
0.786964107493253 SCORE: 0.8129997530818132 SCORE: 0.7564773124337357 SCORE: 0.7739257849614387 SCORE: 0.8162253810244755 SCORE: 0.8102156468738246 SCORE: 0.8071633741554511 SCORE: 0.826170431362848 SCORE: 0.7601418996438805 SCORE: 0.75958935685874 SCORE: 0.8119732114497253 SCORE: 0.7932787176561935 SCORE: 0.7515538916218034 SCORE: 0.7690903229952466 SCORE: 0.7636536313672703 SCORE: 0.8045159136698469 SCORE: 0.821148801836408 SCORE: 0.7933820513050847 SCORE: 0.7803070192713656 SCORE: 0.7528391848567286 SCORE: 0.8068849577762991 SCORE: 0.8019769117675173 SCORE: 0.7645127776706725 SCORE: 0.7507152450559355 SCORE: 0.7889838786839353 SCORE: 0.7517656838539691 SCORE: 0.7553679368902911 SCORE: 0.7977905313785336 SCORE: 0.7871297753159666 SCORE: 0.7763025164694661 SCORE: 0.7544182692739569 SCORE: 0.8255871389714061 SCORE: 0.790566360529575 SCORE: 0.7721272209104694 SCORE: 0.778849205773371 SCORE: 0.8018120789060235 SCORE: 0.764667360663399 SCORE: 0.8147598801249056 SCORE: 0.7876276713288598 SCORE: 0.7548751658117815 SCORE: 0.7613776710409421 SCORE: 0.7925946540830525 SCORE: 0.7690920505012185 SCORE: 0.8069285197185594 SCORE: 0.8162586930979686 SCORE: 0.7685095930709966 SCORE: 0.8022322659419433 SCORE: 0.7559845413552261 SCORE: 0.7676709465051288 SCORE: 0.788009421356903 SCORE: 0.7626578393414839 SCORE: 0.796168719980265 SCORE: 0.8018505159139002 SCORE: 0.7640601711060115 SCORE: 0.7851424812375576 SCORE: 0.7822396665775273 SCORE: 0.758637126775214 SCORE: 0.8218559276142751 SCORE: 0.7741734517342893 SCORE: 0.7600949402732086 SCORE: 0.7944444674778574 SCORE: 0.7695549069346467 SCORE: 0.7925972165502442 SCORE: 0.8167130271686016 SCORE: 0.7922359662597352 SCORE: 0.7663395288607513 SCORE: 0.7586627514471318 SCORE: 0.7927210499366696 SCORE: 0.7856019402425741 SCORE: 0.7598113989596729 SCORE: 0.8035688085207043 SCORE: 0.8036072455285811 SCORE: 0.8085152915373628 SCORE: 0.7536727064882129 SCORE: 0.7761214738436018 SCORE: 0.7603716291463886 SCORE: 0.7888557553243462 SCORE: 
0.773154597503777 SCORE: 0.8023663491971359 SCORE: 0.7519979758236691 SCORE: 0.8133148789628699 SCORE: 0.7725661801779515 SCORE: 0.7745398269591818 SCORE: 0.7518895172403943 SCORE: 0.765586278673432 SCORE: 0.7813805202741253 SCORE: 0.8131628584373349 SCORE: 0.7881435046120955 SCORE: 0.7815043536605506 SCORE: 0.7801729360161732 SCORE: 0.7950559470084089 SCORE: 0.7822422290447191 SCORE: 0.7572928967948774 SCORE: 0.821393906142067 SCORE: 0.803284432245949 SCORE: 0.7903443184286422 SCORE: 0.7622086302052347 SCORE: 0.7714303450013693 SCORE: 0.7590965857802303 SCORE: 0.7468755751155298 SCORE: 0.77921045606388 SCORE: 0.8033493288869632 SCORE: 0.7634802761429813 SCORE: 0.7559358544785821 SCORE: 0.8260619727795732 SCORE: 0.8215877611038902 SCORE: 0.8233965750236265 SCORE: 0.8128477325562782 SCORE: 0.8115914614216821 SCORE: 0.823275304104393 SCORE: 0.8073956661251511 SCORE: 0.8007180781657296 SCORE: 0.8087168339007613 SCORE: 0.8169222569335756 SCORE: 0.8207721767427483 SCORE: 0.7623042764525504 SCORE: 0.7894262353798291 SCORE: 0.818708008648586 SCORE: 0.8094393344817793 SCORE: 0.8033365165510045 SCORE: 0.7738908029655059 SCORE: 0.8268903694766739 SCORE: 0.8072948949434519 SCORE: 0.814218422169752 SCORE: 0.8129843782786623 SCORE: 0.8091942301761205 SCORE: 0.7863090660204411 SCORE: 0.7867762124270329 SCORE: 0.7603246697757167 SCORE: 0.7980005961047274 SCORE: 0.8122157532881925 SCORE: 0.7913947572266757 SCORE: 0.8040359549272962 SCORE: 0.821577511235123 SCORE: 0.7973968039757513 SCORE: 0.7678724888685273 SCORE: 0.8135292336622275 SCORE: 0.7618883793897943 SCORE: 0.8077107920062081 SCORE: 0.7742895977191394 SCORE: 0.7696146786412774 SCORE: 0.7908507368043305 SCORE: 0.8131338363370055 SCORE: 0.7620873592860011 SCORE: 0.8105077105501554 SCORE: 0.7571485636709179 100%|██████████| 300/300 [1:13:33<00:00, 14.71s/trial, best loss: -0.8268903694766739]
print(best_hyperparams)
{'colsample_bytree': 0.6390806383770113, 'eta': 0.057884932782664975, 'gamma': 1.34439452983124, 'max_depth': 12.0, 'min_child_weight': 1.0, 'n_estimators': 700.0, 'reg_alpha': 12.0, 'reg_lambda': 0.5157939591884695}
%%time
# Extended dataset: XGBRF with hyperopt params, now including the learning rate.
# NOTE(review): the objective tuned an XGBClassifier, but the values are applied
# to an XGBRFClassifier here — confirm that transfer was intentional.
X = df_train_features_tr
y = y
xgb_mdl = xgboost.XGBRFClassifier(n_estimators=700, max_depth=12, eta=0.0578, colsample_bytree=0.64, gamma=1.344, min_child_weight=1, reg_alpha=12, reg_lambda=0.516, random_state=SEED)
xgb_scores_cb = cross_val_score(xgb_mdl, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {xgb_scores_cb.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.8452646025565423 CPU times: user 21min 3s, sys: 8.38 s, total: 21min 11s Wall time: 2min 51s
# Extended dataset, slightly better than RF — refit on the full training set
# and write a timestamped submission file.
xgb_mdl = xgboost.XGBRFClassifier(n_estimators=700, max_depth=12, eta=0.0578, colsample_bytree=0.64, gamma=1.344, min_child_weight=1, reg_alpha=12, reg_lambda=0.516, random_state=SEED)
xgb_mdl.fit(X, y)
X_test = df_test_features_tr
xgb_mdl
y_test_pred = xgb_mdl.predict_proba(X_test)[:, 1]  # P(radiant win)
df_submission = pd.DataFrame({"radiant_win_prob": y_test_pred},
index=indexes)
submission_filename = "submission_{}.csv".format(
datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))
Файл посылки сохранен, как: submission_2023-04-09_23-32-58.csv
%%time
# Extended dataset, XGBRF + learning rate — best so far.
# NOTE(review): the hyperparameters here are byte-identical to the previous
# cross-validation cell, yet the printed score differs slightly — verify
# whether this re-run was intentional or a copy-paste leftover.
X = df_train_features_tr
y = y
xgb_mdl = xgboost.XGBRFClassifier(n_estimators=700, max_depth=12, eta=0.0578, colsample_bytree=0.64, gamma=1.344, min_child_weight=1, reg_alpha=12, reg_lambda=0.516, random_state=SEED)
xgb_scores_cb = cross_val_score(xgb_mdl, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {xgb_scores_cb.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.8449916005203267 CPU times: user 26min 5s, sys: 13.1 s, total: 26min 18s Wall time: 3min 41s
# Extended dataset, slightly better than RF — refit on all training data
# and export predictions as a submission.
xgb_mdl = xgboost.XGBRFClassifier(n_estimators=700, max_depth=12, eta=0.0578, colsample_bytree=0.64, gamma=1.344, min_child_weight=1, reg_alpha=12, reg_lambda=0.516, random_state=SEED)
xgb_mdl.fit(X, y)
X_test = df_test_features_tr
xgb_mdl
y_test_pred = xgb_mdl.predict_proba(X_test)[:, 1]  # P(radiant win)
df_submission = pd.DataFrame({"radiant_win_prob": y_test_pred},
index=indexes)
submission_filename = "submission_{}.csv".format(
datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))
Файл посылки сохранен, как: submission_2023-04-10_00-50-57.csv
Now try gradient boosting proper with XGBClassifier instead of the random-forest mode.
%%time
# Extended dataset: switch from XGBRF to true gradient boosting (XGBClassifier).
X = df_train_features_tr
y = y
xgb_mdl = xgboost.XGBClassifier(
objective='binary:logistic',
eval_metric='auc',
n_estimators=500,
learning_rate=0.1,
max_depth=6,
min_child_weight=1,
gamma=0.1,
subsample=0.8,
colsample_bytree=0.8,
scale_pos_weight=1,
seed=SEED
)
xgb_scores_cb = cross_val_score(xgb_mdl, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {xgb_scores_cb.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.8970959320738435 CPU times: user 10min 16s, sys: 5.16 s, total: 10min 22s Wall time: 1min 27s
# Extended dataset: gradient-boosted XGBClassifier — refit on the full
# training set and write a timestamped submission file.
xgb_mdl = xgboost.XGBClassifier(
objective='binary:logistic',
eval_metric='auc',
n_estimators=500,
learning_rate=0.1,
max_depth=6,
min_child_weight=1,
gamma=0.1,
subsample=0.8,
colsample_bytree=0.8,
scale_pos_weight=1,
seed=SEED
)
xgb_mdl.fit(X, y)
X_test = df_test_features_tr
xgb_mdl
y_test_pred = xgb_mdl.predict_proba(X_test)[:, 1]  # P(radiant win)
df_submission = pd.DataFrame({"radiant_win_prob": y_test_pred},
index=indexes)
submission_filename = "submission_{}.csv".format(
datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))
Файл посылки сохранен, как: submission_2023-04-10_01-12-08.csv
Now I'll try CatBoost.
%%time
# Extended dataset: CatBoost baseline with moderate learning rate.
X = df_train_features_tr
y = y
cat_mdl = catboost.CatBoostClassifier(
iterations=1000,
learning_rate=0.05,
depth=6,
l2_leaf_reg=1,
random_seed=SEED,
eval_metric="AUC",
verbose=0,
)
cv_scores_cb = cross_val_score(cat_mdl, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {cv_scores_cb.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.8785165370039281 CPU times: user 3min 12s, sys: 12.8 s, total: 3min 25s Wall time: 41.3 s
# Extended dataset, CatBoost (worse than RF on CV) — refit on all training
# data anyway and export predictions as a submission.
cat_mdl = catboost.CatBoostClassifier(
iterations=1000,
learning_rate=0.05,
depth=6,
l2_leaf_reg=1,
random_seed=SEED,
eval_metric="AUC",
verbose=0,
)
cat_mdl.fit(X, y)
X_test = df_test_features_tr
cat_mdl
y_test_pred = cat_mdl.predict_proba(X_test)[:, 1]  # P(radiant win)
df_submission = pd.DataFrame({"radiant_win_prob": y_test_pred},
index=indexes)
submission_filename = "submission_{}.csv".format(
datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))
Файл посылки сохранен, как: submission_2023-04-10_00-57-25.csv
Best model
%%time
# Extended dataset: CatBoost with a lower learning rate and more regularization.
# NOTE(review): labeled "BEST MODEL AS FAR" in the original, but this cell's
# printed CV score (0.850) is below the earlier CatBoost run (0.879) — the
# "best" claim presumably refers to the leaderboard, verify before trusting it.
X = df_train_features_tr
y = y
cat_mdl = catboost.CatBoostClassifier(
iterations=1000,
learning_rate=0.01,
depth=6,
l2_leaf_reg=3,
border_count=254,
bagging_temperature=1,
random_strength=1,
random_seed=SEED,
eval_metric="AUC",
verbose=0,
)
cv_scores_cb = cross_val_score(cat_mdl, X, y, cv=cv, scoring="roc_auc")
print(f"Среднее значение ROC-AUC на кросс-валидации: {cv_scores_cb.mean()}")
Среднее значение ROC-AUC на кросс-валидации: 0.8504842299286703 CPU times: user 3min 12s, sys: 13.1 s, total: 3min 25s Wall time: 43.4 s
# Extended dataset, tuned CatBoost — refit on the full training set
# and write a timestamped submission file.
cat_mdl = catboost.CatBoostClassifier(
iterations=1000,
learning_rate=0.01,
depth=6,
l2_leaf_reg=3,
border_count=254,
bagging_temperature=1,
random_strength=1,
random_seed=SEED,
eval_metric="AUC",
verbose=0,
)
cat_mdl.fit(X, y)
X_test = df_test_features_tr
cat_mdl
y_test_pred = cat_mdl.predict_proba(X_test)[:, 1]  # P(radiant win)
df_submission = pd.DataFrame({"radiant_win_prob": y_test_pred},
index=indexes)
submission_filename = "submission_{}.csv".format(
datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))
Файл посылки сохранен, как: submission_2023-04-10_01-00-41.csv
# Refit the tuned CatBoost config on the train split only, to inspect
# confusion matrices and ROC curves on train vs. hold-out.
mdl = catboost.CatBoostClassifier(
iterations=1000,
learning_rate=0.01,
depth=6,
l2_leaf_reg=3,
border_count=254,
bagging_temperature=1,
random_strength=1,
random_seed=SEED,
eval_metric="AUC",
verbose=0,
)
mdl.fit(X_tr, y_tr)
y_pr_tr = mdl.predict(X_tr)
y_pr_te = mdl.predict(X_te)
# NOTE(review): this AUC is computed on hard labels, not probabilities —
# predict_proba would give the true ROC-AUC; verify intent.
roc_auc_score(y_te, y_pr_te)
# NOTE(review): conf_mtr is defined in a later cell; this call only works in
# a notebook if that cell was executed first — confirm the execution order.
conf_mtr(mdl, X_tr, y_tr, y_pr_tr, X_te, y_te, y_pr_te)
def conf_mtr(best_mdl, X_train, y_train, y_pred_train, X_test, y_test, y_pred_test):
    """Plot train/test confusion matrices and ROC curves in a 2x2 grid.

    Left column: ROC curves (top = train, bottom = test) with a dashed
    chance-level diagonal; right column: confusion matrices. The figure
    title shows the model parameters plus accuracy and F1 on the test split.

    Parameters
    ----------
    best_mdl : fitted classifier
        Must expose classes_ and work with RocCurveDisplay.from_estimator.
    X_train, y_train, y_pred_train : train features, labels, predicted labels.
    X_test, y_test, y_pred_test : test features, labels, predicted labels.
    """
    fig, ax = plt.subplots(2, 2, figsize=(14, 10))
    ax[0][1].set_title("train", size=14, weight='bold')
    ax[1][1].set_title("test", size=14, weight='bold')
    cm_train = confusion_matrix(y_train, y_pred_train, labels=best_mdl.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm_train,
                                  display_labels=best_mdl.classes_)
    disp.plot(ax=ax[0][1], cmap=sns.color_palette("blend:aliceblue,steelblue", as_cmap=True))
    cm_test = confusion_matrix(y_test, y_pred_test, labels=best_mdl.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm_test,
                                  display_labels=best_mdl.classes_)
    disp.plot(ax=ax[1][1], cmap=sns.color_palette("blend:aliceblue,steelblue", as_cmap=True))
    ax[0][0].set_title("train", size=14, weight='bold')
    ax[1][0].set_title("test", size=14, weight='bold')
    RocCurveDisplay.from_estimator(best_mdl, X_train, y_train, ax=ax[0][0], color='steelblue')
    ax[0][0].plot([0, 1], linestyle='--', color='lightskyblue', linewidth=0.7)
    RocCurveDisplay.from_estimator(best_mdl, X_test, y_test, ax=ax[1][0], color='steelblue')
    ax[1][0].plot([0, 1], linestyle='--', color='lightskyblue', linewidth=0.7)
    # BUG FIX: the original read the notebook-global y_pr_te in the two lines
    # below instead of the y_pred_test parameter, silently coupling the title
    # metrics to outer state (and breaking the function for any other inputs).
    title = (f"{str(best_mdl.get_params)[42:-1]}\n"
             f"accuracy score on test {accuracy_score(y_test, y_pred_test)}\n"
             f"f1 score on test {f1_score(y_test, y_pred_test)}")
    fig.suptitle(title, weight="bold", size=16)
    plt.subplots_adjust(hspace=0.3)